<a href="https://colab.research.google.com/github/robertccruz13-ship-it/sds-510-robert-cruz/blob/main/module_5_essentials.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-only step: files.upload() opens an interactive file picker in the
# browser. The later cells expect a file named "jeopardy.json" to be uploaded.
from google.colab import files

# Keep the return value of the upload call for later inspection if needed.
uploaded = files.upload()

Saving jeopardy.json to jeopardy.json


In [2]:
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score,classification_report

In [3]:
# Parse the uploaded JSON file. The encoding is given explicitly so the read
# does not depend on the platform's default locale encoding.
with open("jeopardy.json", "r", encoding="utf-8") as f:
  data = json.load(f)

# Build a DataFrame from the parsed records, then preview the first rows
# together with the column names.
df = pd.DataFrame(data)
df.head(), df.columns

(                          category    air_date  ...      round show_number
 0                          HISTORY  2004-12-31  ...  Jeopardy!        4680
 1  ESPN's TOP 10 ALL-TIME ATHLETES  2004-12-31  ...  Jeopardy!        4680
 2      EVERYBODY TALKS ABOUT IT...  2004-12-31  ...  Jeopardy!        4680
 3                 THE COMPANY LINE  2004-12-31  ...  Jeopardy!        4680
 4              EPITAPHS & TRIBUTES  2004-12-31  ...  Jeopardy!        4680
 
 [5 rows x 7 columns],
 Index(['category', 'air_date', 'question', 'value', 'answer', 'round',
        'show_number'],
       dtype='object'))

In [4]:
# Keep only the clue text and its dollar value; the other columns are unused here.
df = df[['question', 'value']].copy()

# Rows missing either field cannot be used.
df = df.dropna(subset=['question', 'value'])

# Strip "$" and "," so that e.g. "$1,200" becomes "1200".
# The pattern is a raw string: in a normal string literal '\$' is an invalid
# escape sequence and makes Python emit a warning when the cell is compiled.
df['value_clean'] = (
      df['value']
      .astype(str)
      .str.replace(r'[\$,]', '', regex=True)
)

# Drop placeholder strings that cannot be parsed as integers. The explicit
# .copy() makes the filtered frame independent, so the column assignment
# below is not a write to a slice of the previous frame.
df = df[(df['value_clean'] != 'None') & (df['value_clean'] != '')].copy()

df['value_clean'] = df['value_clean'].astype(int)

# Show the raw and cleaned values side by side as a sanity check.
df[['question', 'value', 'value_clean']].head()

  .str.replace('[\$,]', '', regex=True)


Unnamed: 0,question,value,value_clean
0,"'For the last 8 years of his life, Galileo was...",$200,200
1,'No. 2: 1912 Olympian; football star at Carlis...,$200,200
2,'The city of Yuma in this state has a record a...,$200,200
3,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,200
4,"'Signer of the Dec. of Indep., framer of the C...",$200,200


In [5]:
# Binary target: 1 when the cleaned value is strictly greater than the
# threshold, 0 otherwise.
threshold = 800
df['label'] = df['value_clean'].gt(threshold).astype(int)

# Class balance of the resulting labels.
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,151874
1,61422


In [6]:
# Features are the raw clue text; targets are the binary labels.
X, y = df['question'], df['label']

# Hold out 20% of the rows for testing, stratified on the label, with a
# fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

len(X_train), len(X_test)

(170636, 42660)

In [7]:
def train_and_evaluate(model, X_train_vec, X_test_vec, y_train, y_test, name="Model"):
  """Fit ``model`` on the training data, print test metrics, and return it.

  Args:
    model: Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train_vec: Training features passed to ``model.fit``.
    X_test_vec: Test features passed to ``model.predict``.
    y_train: Training targets passed to ``model.fit``.
    y_test: True test targets the predictions are scored against.
    name: Label printed in the report header.

  Returns:
    The same ``model`` object, after ``fit`` has been called on it.
  """
  model.fit(X_train_vec, y_train)
  predictions = model.predict(X_test_vec)

  # Header first, then each metric is computed and printed in turn.
  print(f"=== {name} ===")
  accuracy = accuracy_score(y_test, predictions)
  print(f"Accuracy: {accuracy}")
  report = classification_report(y_test, predictions)
  print(report)
  print()  # blank line separating consecutive reports
  return model

In [8]:
# Word-count features with English stop words removed. The vectorizer is
# fitted on the training split only and then applied to the test split.
count_vec = CountVectorizer(stop_words='english')
X_train_counts = count_vec.fit_transform(X_train)
X_test_count = count_vec.transform(X_test)

# Baseline: multinomial Naive Bayes on the raw counts.
nb = train_and_evaluate(
    MultinomialNB(),
    X_train_counts,
    X_test_count,
    y_train,
    y_test,
    name="Naive Bayes + CountVectorizer",
)

=== Naive Bayes + CountVectorizer ===
Accuracy: 0.693413033286451
              precision    recall  f1-score   support

           0       0.73      0.91      0.81     30375
           1       0.42      0.17      0.24     12285

    accuracy                           0.69     42660
   macro avg       0.57      0.54      0.52     42660
weighted avg       0.64      0.69      0.64     42660




In [9]:
# TF-IDF features with English stop words removed. The vectorizer is fitted
# on the training split only and then applied to the test split.
tfidf = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Logistic regression with the iteration cap set to 1000.
log_reg = train_and_evaluate(
    LogisticRegression(max_iter=1000),
    X_train_tfidf,
    X_test_tfidf,
    y_train,
    y_test,
    name="Logistic Regression + TF-IDF",
)

=== Logistic Regression + TF-IDF ===
Accuracy: 0.7097046413502109
              precision    recall  f1-score   support

           0       0.72      0.96      0.83     30375
           1       0.48      0.08      0.14     12285

    accuracy                           0.71     42660
   macro avg       0.60      0.52      0.48     42660
weighted avg       0.65      0.71      0.63     42660




In [11]:
# TF-IDF over unigrams and bigrams with English stop words removed. The
# vectorizer is fitted on the training split only.
tfidf_ngrams = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
X_train_tfidf_ngrams = tfidf_ngrams.fit_transform(X_train)
X_test_tfidf_ngrams = tfidf_ngrams.transform(X_test)

# Linear SVM with default settings on the n-gram features.
svm = train_and_evaluate(
    LinearSVC(),
    X_train_tfidf_ngrams,
    X_test_tfidf_ngrams,
    y_train,
    y_test,
    name="Linear SVM + TF-IDF (1-2 grams)",
)

=== Linear SVM + TF-IDF (1-2 grams) ===
Accuracy: 0.68314580403188
              precision    recall  f1-score   support

           0       0.74      0.86      0.80     30375
           1       0.41      0.24      0.30     12285

    accuracy                           0.68     42660
   macro avg       0.57      0.55      0.55     42660
weighted avg       0.64      0.68      0.65     42660


