**Build a model to predict the value of the question in the TV game show  “Jeopardy!”.**


Data description 
* > 'category' : the question category, e.g. "HISTORY" 
* > 'value' : $ value of the question as a string, e.g. "$200" (Note: "None" for Final Jeopardy! and Tiebreaker questions) 
* > 'question' : text of the question (Note: this sometimes contains hyperlinks and other messy text, such as when there's a picture or video question) 
* > 'answer' : text of the answer 
* > 'round' : one of "Jeopardy!", "Double Jeopardy!", "Final Jeopardy!" or "Tiebreaker" (Note: Tiebreaker questions do happen but they're very rare (like once every 20 years)) 
* > 'show_number' : string of the show number, e.g. '4680' 
* > 'air_date' : the show air date in format YYYY-MM-DD 


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory. `file_path` is overwritten on every
# iteration, so after the loop it holds the path of the last file visited;
# that path is what the "Reading Csv" cell passes to pd.read_csv.
# NOTE(review): if the directory contains no files, `file_path` is never
# assigned and the later read fails with NameError; with several files the
# one that wins depends on os.walk's traversal order.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing necessary libraries**

In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')

from stop_words import get_stop_words
from nltk.corpus import stopwords

**Reading Csv**

In [None]:
jeo_df = pd.read_csv(file_path)
print(f"Shape of jeo_df is :- {jeo_df.shape}")
jeo_df.head()

In [None]:
# Name of the columns
print(f"Column names:- {jeo_df.columns}")

In [None]:
# Number of missing values per column.
# (`isna().count()` would report the total row count for every column, because
# count() tallies the non-null True/False flags rather than the True ones.)
jeo_df.isna().sum()

In [None]:
#Renaming the columns names since it contains spaces for few col names
jeo_df.columns = ["Show_Number", "Air_Date", "Round", "Category", "Value", "Question", "Answer"] 

In [None]:
%matplotlib inline
# Function to draw bar plot
def get_catogorical_features_plot(feat_name, plot=True):
    """Summarise one column of the global ``jeo_df`` frame.

    With ``plot=True`` a bar chart of the per-value row counts is drawn and the
    share of rows taken by each value (value count / total rows) is printed.
    With ``plot=False`` only the number of distinct values is printed.
    Nothing is returned; every printed result is followed by a blank line.
    """
    if not plot:
        n_distinct = jeo_df[feat_name].value_counts().shape[0]
        print(n_distinct)
        print()
        return

    jeo_df.groupby(feat_name).size().plot(kind='bar')
    total_rows = jeo_df.shape[0]
    shares = jeo_df[feat_name].value_counts() / total_rows
    print(shares)
    print()
    
  
get_catogorical_features_plot("Round") 

In [None]:
# Final Jeopardy! and Tiebreaker questions have no dollar value: mark their
# Value as the string "None", then drop every row carrying that marker.
# Those rounds account for only a very small share of the rows.
no_value_round = jeo_df['Round'].isin(["Final Jeopardy!", "Tiebreaker"])
jeo_df['Value'] = jeo_df['Value'].mask(no_value_round, "None")
jeo_df = jeo_df[jeo_df['Value'] != "None"]
print(f"Shape of jeo_df is :- {jeo_df.shape}")


In [None]:
# Total unique show number
get_catogorical_features_plot("Show_Number", plot=False) 

In [None]:
# Total Unique Category
get_catogorical_features_plot("Category", plot=False) 

In [None]:
# Total Unique value
get_catogorical_features_plot("Value", plot=False) 

In [None]:
# Checking if any column contains null value or not
jeo_df.isnull().sum()

In [None]:
jeo_df['Answer'].value_counts().to_frame()

In [None]:
# Fill the missing answers with the most frequent answer (the mode), since only 2 of them are missing
jeo_df['Answer'] = jeo_df['Answer'].fillna(jeo_df['Answer'].mode()[0])

In [None]:
jeo_df["Question"].to_list()[:3]

In [None]:
jeo_df["Answer"].to_list()[:20]

> As seen from the questions and answers above, we can conclude that the answer to each question is a noun

In [None]:
np.where((jeo_df['Round'] == "Jeopardy!")
                           & (jeo_df['Show_Number'] == 4680))[0].shape

In [None]:
# Modifying Value column as numeric and reducing the number of classes to predict
jeo_df['Modified_Value'] = jeo_df['Value'].apply(
    lambda value: int(value.replace(',', '').replace('$', '').replace(" ", ""))
)
def binning(value):
    """Round a dollar value to a coarser bin to reduce the number of classes.

    Values below 1000 are rounded to the nearest hundred, values below 10000
    to the nearest thousand, and everything else to the nearest ten thousand.
    """
    if value >= 10000:
        decimals = -4
    elif value >= 1000:
        decimals = -3
    else:
        decimals = -2
    # A negative `decimals` makes np.round round to the left of the decimal point.
    return np.round(value, decimals)

jeo_df['Modified_Bins'] = jeo_df['Modified_Value'].apply(binning)

In [None]:
jeo_df.head(5)

In [None]:
jeo_df['Modified_Bins'].value_counts().to_frame()

**Dropping the 20000 bin since it has a total count of less than 3**

In [None]:
jeo_df = jeo_df[jeo_df['Modified_Bins'] != 20000] 

In [None]:
target = jeo_df["Modified_Bins"] 
jeo_df.drop(['Modified_Value', "Value", "Modified_Bins"], axis=1, inplace=True)


In [None]:
# stop_words = list(get_stop_words('en'))         
# nltk_words = list(stopwords.words('english'))
# stop_words.extend(nltk_words)
# stop_words = set(stop_words)
# len(stop_words)

In [None]:
#sort dataframe based on time pandas python: https://stackoverflow.com/a/49702492/4084039
jeo_df["Date"] = pd.to_datetime(jeo_df['Air_Date'])
jeo_df.drop('Air_Date', axis=1, inplace=True)
jeo_df.sort_values(by=['Date'], inplace=True)


In [None]:
jeo_df.head(5)

****Text Preprocessing****

In [None]:
import re
def decontracted(phrase):
    """Expand common English contractions in *phrase* and return the result.

    The substitutions are case-sensitive regular-expression replacements that
    are applied one after another in the order listed below.
    """
    # Order matters: the two irregular forms come first so the generic "n't"
    # rule does not turn them into "wo not" / "ca not".
    rules = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [None]:
# https://gist.github.com/sebleier/554280
# 'no', 'nor' and 'not' are deliberately left out of this stop-word list, so negations are kept in the text
stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]

In [None]:
# Combining all the above steps: expand contractions, strip escape sequences
# and punctuation, lowercase, then drop stop words.
from tqdm import tqdm

# Set membership is O(1) per token; the list would be scanned for every word.
stopword_set = set(stopwords)
preprocessed_question = []
# tqdm is for printing the status bar
for sentance in tqdm(jeo_df['Question'].values):
    sent = decontracted(sentance)
    # These replace the literal two-character sequences backslash+r,
    # backslash+quote and backslash+n, not real control characters.
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    # https://gist.github.com/sebleier/554280
    # Lowercase BEFORE filtering: the stop-word list is all lowercase, so
    # filtering first let capitalised stop words ("The", "This") slip through.
    sent = ' '.join(e for e in sent.lower().split() if e not in stopword_set)
    preprocessed_question.append(sent)

In [None]:
jeo_df['Preprocessed_Question'] = preprocessed_question    #create new column having name  with preprocessed data
jeo_df.drop(['Question', 'Date'], axis=1, inplace=True) #delete the column
jeo_df.head(2)

In [None]:
# Feature Engineering

def count(line):
    """Return a list with the number of whitespace-separated words in each
    string of the iterable *line*."""
    return [len(text.split()) for text in line]

jeo_df['Count_Question'] = count(jeo_df['Preprocessed_Question'])    #create new column having name count_Question with preprocessed data
jeo_df.head(3)

1. Model Training starts

In [None]:
#train_test_split
from sklearn.model_selection import train_test_split

# `target` was taken from jeo_df before the frame was sorted by date, and
# train_test_split pairs rows by position, not by index label. Re-order the
# labels to follow jeo_df's current row order so that every question keeps
# its own value bin (no-op if the two are already aligned).
target = target.loc[jeo_df.index]

# First split: 67% train+cv / 33% test; second split: 67% train / 33% cv.
# Both are stratified on the value bin.
project_data_train, project_data_test, project_data_y_train, project_data_y_test = train_test_split(jeo_df, target, test_size=0.33, stratify=target)
project_data_train, project_data_cv, project_data_y_train, project_data_y_cv = train_test_split(project_data_train, project_data_y_train, test_size=0.33, stratify=project_data_y_train)

In [None]:
# Question
# Binary bag-of-words over the preprocessed question text. The vocabulary
# (at most 2000 terms) is learned from the training split only and then
# applied unchanged to the cv and test splits.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer3 = CountVectorizer(lowercase=False, binary=True, max_features=2000)
feat_1_train = vectorizer3.fit_transform(project_data_train['Preprocessed_Question'].values)
feat_1_cv = vectorizer3.transform(project_data_cv['Preprocessed_Question'].values)
feat_1_test = vectorizer3.transform(project_data_test['Preprocessed_Question'].values)

print("After vectorizations")
for feats, labels in (
    (feat_1_train, project_data_y_train),
    (feat_1_cv, project_data_y_cv),
    (feat_1_test, project_data_y_test),
):
    print(feats.shape, labels.shape)
print("="*100)


In [None]:
# Round
# Binary bag-of-words over the round name, with the vocabulary learned from
# the training split only.
# NOTE(review): CountVectorizer tokenises the text, so a round name is
# presumably encoded by its word tokens rather than as one whole label —
# confirm this is the intended encoding.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer3 = CountVectorizer(lowercase=False, binary=True, max_features=2000)
feat_2_train = vectorizer3.fit_transform(project_data_train['Round'].values)
feat_2_cv = vectorizer3.transform(project_data_cv['Round'].values)
feat_2_test = vectorizer3.transform(project_data_test['Round'].values)

print("After vectorizations")
for feats, labels in (
    (feat_2_train, project_data_y_train),
    (feat_2_cv, project_data_y_cv),
    (feat_2_test, project_data_y_test),
):
    print(feats.shape, labels.shape)
print("="*100)


In [None]:
# Category
# Binary bag-of-words over the category text (at most 2000 terms), with the
# vocabulary learned from the training split only.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer3 = CountVectorizer(lowercase=False, binary=True, max_features=2000)
feat_3_train = vectorizer3.fit_transform(project_data_train['Category'].values)
feat_3_cv = vectorizer3.transform(project_data_cv['Category'].values)
feat_3_test = vectorizer3.transform(project_data_test['Category'].values)

print("After vectorizations")
for feats, labels in (
    (feat_3_train, project_data_y_train),
    (feat_3_cv, project_data_y_cv),
    (feat_3_test, project_data_y_test),
):
    print(feats.shape, labels.shape)
print("="*100)


In [None]:
# Answer
# Binary bag-of-words over the answer text (at most 2000 terms), with the
# vocabulary learned from the training split only.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer3 = CountVectorizer(lowercase=False, binary=True, max_features=2000)
feat_4_train = vectorizer3.fit_transform(project_data_train['Answer'].values)
feat_4_cv = vectorizer3.transform(project_data_cv['Answer'].values)
feat_4_test = vectorizer3.transform(project_data_test['Answer'].values)

print("After vectorizations")
for feats, labels in (
    (feat_4_train, project_data_y_train),
    (feat_4_cv, project_data_y_cv),
    (feat_4_test, project_data_y_test),
):
    print(feats.shape, labels.shape)
print("="*100)


In [None]:
# Standardise the question word count (zero mean, unit variance).
# standardization sklearn: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
from sklearn.preprocessing import StandardScaler

# StandardScaler expects a 2-D array, hence reshape(-1, 1) to a single column.
# Mean and standard deviation come from the training split only and are then
# reused for the cv and test splits.
num_title_scalar = StandardScaler()
feat_5_train = num_title_scalar.fit_transform(project_data_train['Count_Question'].values.reshape(-1, 1))
feat_5_cv = num_title_scalar.transform(project_data_cv['Count_Question'].values.reshape(-1, 1))
feat_5_test = num_title_scalar.transform(project_data_test['Count_Question'].values.reshape(-1, 1))

In [None]:
# Standardise the show number (zero mean, unit variance).
# standardization sklearn: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
from sklearn.preprocessing import StandardScaler

# StandardScaler expects a 2-D array, hence reshape(-1, 1) to a single column.
# Mean and standard deviation come from the training split only and are then
# reused for the cv and test splits.
num_title_scalar = StandardScaler()
feat_6_train = num_title_scalar.fit_transform(project_data_train['Show_Number'].values.reshape(-1, 1))
feat_6_cv = num_title_scalar.transform(project_data_cv['Show_Number'].values.reshape(-1, 1))
feat_6_test = num_title_scalar.transform(project_data_test['Show_Number'].values.reshape(-1, 1))

In [None]:
# merge two sparse matrices: https://stackoverflow.com/a/19710648/4084039
from scipy.sparse import hstack

# Concatenate the six feature blocks column-wise, in the same order for every
# split: question words, round, category, answer, scaled question word count,
# scaled show number. The stacked result is converted to CSR format.
lr_train_1=hstack((feat_1_train, feat_2_train, feat_3_train, feat_4_train, feat_5_train, feat_6_train)).tocsr()
lr_cv_1=hstack((feat_1_cv, feat_2_cv, feat_3_cv, feat_4_cv, feat_5_cv, feat_6_cv)).tocsr()
lr_test_1=hstack((feat_1_test, feat_2_test, feat_3_test, feat_4_test, feat_5_test, feat_6_test)).tocsr()


**Different models**

In [None]:
import random as r
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Tune the inverse regularisation strength C of a class-balanced, L2-penalised
# logistic regression by log loss on the cv split. The candidate list is named
# `alpha`, but its values are passed as C.
alpha = [10 ** x for x in range(-3, 3)]
cv_log_error_array=[]
for i in alpha:
    logisticR=LogisticRegression(penalty='l2',C=i,class_weight='balanced')
    # CalibratedClassifierCV fits its own clones of the estimator during
    # cross-validation, so fitting logisticR separately first is wasted work.
    sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
    sig_clf.fit(lr_train_1, project_data_y_train)
    predict_y = sig_clf.predict_proba(lr_cv_1)
    # predict_proba columns follow sig_clf.classes_. The `eps` argument is not
    # passed: it is deprecated/removed in recent scikit-learn releases and its
    # old default was the same 1e-15.
    cv_log_error_array.append(log_loss(project_data_y_cv, predict_y, labels=sig_clf.classes_))
    print(f"Done alpha== {i}")

for c_value, cv_loss in zip(alpha, cv_log_error_array):
    print ('log_loss for c = ',c_value,'is',cv_loss)


In [None]:
import matplotlib.pyplot as plt
# Index of the C value with the lowest cv log loss.
best_alpha = np.argmin(cv_log_error_array)

fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array,c='g')
# The candidates are powers of ten, so a log axis keeps the points readable.
ax.set_xscale("log")
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((alpha[i],np.round(txt,3)), (alpha[i],cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# Refit with the best C and calibrate the probabilities.
logisticR=LogisticRegression(penalty='l2',C=alpha[best_alpha],class_weight='balanced')
# The fitted logisticR is kept because later cells read logisticR.classes_.
logisticR.fit(lr_train_1,project_data_y_train)
sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
sig_clf.fit(lr_train_1, project_data_y_train)
pred_y=sig_clf.predict(lr_test_1)

# Report log loss on every split. `eps` is not passed to log_loss: it is
# deprecated/removed in recent scikit-learn releases and its old default was
# the same 1e-15.
for split_name, split_x, split_y in (
    ("train", lr_train_1, project_data_y_train),
    ("cv", lr_cv_1, project_data_y_cv),
    ("test", lr_test_1, project_data_y_test),
):
    predict_y = sig_clf.predict_proba(split_x)
    print (f'log loss for {split_name} data',log_loss(split_y, predict_y, labels=sig_clf.classes_))

In [None]:
# Tune the number of boosting rounds (n_estimators) of an XGBoost classifier
# by log loss on the cv split. The candidate list is named `alpha` for
# symmetry with the logistic-regression cells.
# NOTE(review): recent xgboost releases appear to require class labels to be
# 0..n_classes-1; the value bins here are dollar amounts, so the labels may
# need a LabelEncoder — confirm against the installed xgboost version.
alpha=[5,10,50]
cv_log_error_array=[]
for i in alpha:
    x_cfl=XGBClassifier(n_estimators=i,nthread=-1)
    # CalibratedClassifierCV fits its own clones of the estimator during
    # cross-validation, so fitting x_cfl separately first is wasted work.
    sig_clf = CalibratedClassifierCV(x_cfl, method="sigmoid")
    sig_clf.fit(lr_train_1, project_data_y_train)
    predict_y = sig_clf.predict_proba(lr_cv_1)
    # Take the label order from the model that produced the probabilities,
    # not from the unrelated logistic-regression estimator. `eps` is not
    # passed: it is deprecated/removed in recent scikit-learn releases.
    cv_log_error_array.append(log_loss(project_data_y_cv, predict_y, labels=sig_clf.classes_))
    print(f"Done alpha== {i}")

for n_trees, cv_loss in zip(alpha, cv_log_error_array):
    print ('log_loss for n_estimators = ',n_trees,'is',cv_loss)


In [None]:
import matplotlib.pyplot as plt
# Index of the n_estimators value with the lowest cv log loss.
best_alpha = np.argmin(cv_log_error_array)

fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array,c='g')
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((alpha[i],np.round(txt,3)), (alpha[i],cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# Refit with the best n_estimators. Using the bare `i` here would pick up the
# leftover index of the annotation loop above (i.e. n_estimators=2), not the
# tuned value.
x_cfl=XGBClassifier(n_estimators=alpha[best_alpha],nthread=-1)
x_cfl.fit(lr_train_1,project_data_y_train)
sig_clf = CalibratedClassifierCV(x_cfl, method="sigmoid")
sig_clf.fit(lr_train_1, project_data_y_train)
pred_y=sig_clf.predict(lr_test_1)

# Report log loss on every split. Labels come from the calibrated XGBoost
# model itself rather than from the logistic-regression estimator, and `eps`
# is not passed to log_loss (deprecated/removed in recent scikit-learn).
for split_name, split_x, split_y in (
    ("train", lr_train_1, project_data_y_train),
    ("cv", lr_cv_1, project_data_y_cv),
    ("test", lr_test_1, project_data_y_test),
):
    predict_y = sig_clf.predict_proba(split_x)
    print (f'log loss for {split_name} data',log_loss(split_y, predict_y, labels=sig_clf.classes_))

**Note:**
1. We can also use a different vectorizer instead of the count vectorizer, e.g. TF-IDF or word2vec (w2v).
2. Training time with w2v will be much greater than with the count and TF-IDF vectorizers.
3. We can also try RandomForest, k-NN or a neural network.
4. The loss used is log loss for multiclass classification.
5. We can also tweak the number of alpha values tried, to check the robustness of the above models.
6. The data is divided into train, cv and test sets first and vectorized afterwards, to avoid data leakage (this increases model robustness).

**If you like this work, please give it an upvote**