In [1]:
#importing required packages
import re
import string
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize

from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

from nltk.stem import PorterStemmer 
from nltk.stem import WordNetLemmatizer

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [2]:
#reading the data
data = pd.read_csv("JEOPARDY_CSV.csv")
#making a copy of the data so to leave the original data untouched
data_copy = data.copy()

In [3]:
print("Shape of data is ",data_copy.shape)

#since all data having Round as Final Jeopardy have value None
print("Data having value as None", data_copy[data_copy[' Round'] == 'Final Jeopardy!'].shape) 

#we can remove those data points
data_copy = data_copy[data_copy[' Round'] != 'Final Jeopardy!']
print("New Data without 'None' in it as value", data_copy.shape) 

Shape of data is  (216930, 7)
Data having value as None (3631, 7)
New Data without 'None' in it as value (213299, 7)


In [4]:
data_copy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [5]:
data_copy.describe

<bound method NDFrame.describe of         Show Number    Air Date             Round  \
0              4680  2004-12-31         Jeopardy!   
1              4680  2004-12-31         Jeopardy!   
2              4680  2004-12-31         Jeopardy!   
3              4680  2004-12-31         Jeopardy!   
4              4680  2004-12-31         Jeopardy!   
...             ...         ...               ...   
216924         4999  2006-05-11  Double Jeopardy!   
216925         4999  2006-05-11  Double Jeopardy!   
216926         4999  2006-05-11  Double Jeopardy!   
216927         4999  2006-05-11  Double Jeopardy!   
216928         4999  2006-05-11  Double Jeopardy!   

                               Category  Value  \
0                               HISTORY   $200   
1       ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2           EVERYBODY TALKS ABOUT IT...   $200   
3                      THE COMPANY LINE   $200   
4                   EPITAPHS & TRIBUTES   $200   
...                          

In [6]:
lemmatizer = WordNetLemmatizer()#lemmatizer

ps = PorterStemmer()#stemming

stop_words = set(stopwords.words('english'))

def PreprocessingText(text):
    """
    input : text
    output : preprcossed text version
    """
    text = str(text)
    text = text.lower() #lowercase text

    text = re.sub(r"<.*?>","", text) # removes anything enclosed between html tag <>

    text = re.sub(r"http\S+", "", text) #removes hhtp / https links

    text  = text.translate(str.maketrans('', '', string.punctuation))#removes punctuation
    text = text.lower() #lowercase text

    word_tokens = word_tokenize(text)#tokenize words
    filtered_sentence = [lemmatizer.lemmatize(ps.stem(w)) for w in word_tokens if not w in stop_words]#stem the text, followed by lemmatization

    return " ".join(filtered_sentence)#concatenates list to string
    

In [7]:
data_copy[' Question'] = data_copy[' Question'].apply(PreprocessingText)#apply preprocessing over Question columns

In [8]:
#merge columns Round, Category, Question into a new columns called concatenate
data_copy['concatenate'] = data_copy[' Round'] + " " + data_copy[' Category'] + " " + data_copy[' Question']

In [9]:
data_copy

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,concatenate
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,last 8 year life galileo hous arrest espous ma...,Copernicus,Jeopardy! HISTORY last 8 year life galileo hou...
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,2 1912 olympian footbal star carlisl indian sc...,Jim Thorpe,Jeopardy! ESPN's TOP 10 ALL-TIME ATHLETES 2 19...
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,citi yuma state record averag 4055 hour sunshi...,Arizona,Jeopardy! EVERYBODY TALKS ABOUT IT... citi yum...
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,1963 live art linklett show compani serv billi...,McDonald's,Jeopardy! THE COMPANY LINE 1963 live art linkl...
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,signer dec indep framer constitut mass second ...,John Adams,Jeopardy! EPITAPHS & TRIBUTES signer dec indep...
...,...,...,...,...,...,...,...,...
216924,4999,2006-05-11,Double Jeopardy!,OFF-BROADWAY,$2000,2006 cast longrun hit embark exuber noisi camp...,Stomp,Double Jeopardy! OFF-BROADWAY 2006 cast longru...
216925,4999,2006-05-11,Double Jeopardy!,RIDDLE ME THIS,$2000,puccini opera turn solut 3 riddl pose heroin,Turandot,Double Jeopardy! RIDDLE ME THIS puccini opera ...
216926,4999,2006-05-11,Double Jeopardy!,"""T"" BIRDS",$2000,north america term properli appli 4 speci cres...,a titmouse,"Double Jeopardy! ""T"" BIRDS north america term ..."
216927,4999,2006-05-11,Double Jeopardy!,AUTHORS IN THEIR YOUTH,$2000,penni lane hellrais grew barber shave anoth cu...,Clive Barker,Double Jeopardy! AUTHORS IN THEIR YOUTH penni ...


In [10]:
#Using TfidfVectorizer to turn the string into a set of integers
#Alternatively Count Vectorizer can also be used.
#I went with TFIDF cause the stop words have already been removed, and tfidf can perform better
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(data_copy['concatenate'])#fit tfidf on the entire column
Y = data_copy[' Value']

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)#splitting into train-test with 25 % as testsize

In [12]:
print("Training shape ",X_train.shape)
print("Testing shape ",X_test.shape)

Training shape  (159974, 86572)
Testing shape  (53325, 86572)


In [15]:
#use default model

model = LinearSVC()
model.fit(X_train, y_train)


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [16]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

Prediction = model.predict(X_test)
acc_score = model.score(X_test, y_test)
c_matrix = confusion_matrix(y_test, Prediction)

precision, recall, f1_score, support = precision_recall_fscore_support(y_test, Prediction)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
print("Model score", acc_score)
print("Model confusion matrix", c_matrix)
print("Model precision", precision)
print("Model recall", recall)
print("Model f1_score ", f1_score)
print("Model support", support)

Model score 0.13819034224097515
Model confusion matrix [[7 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Model precision [0.05185185 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.09896433 0.10164384 0.         0.         0.08569016
 0.         0.06807728 0.01149425 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.17152822 0.0985342  0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.09025507 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.17784722 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.11665732
 0.         0.         0.         0.         0.         0.
 0.         0.08432371 0. 

In [18]:
#to deploy model via flask server
from flask import Flask
from flask import request
app = Flask(__name__)

@app.route('/QuestionWorth', methods=['POST'])
def question():
    if request.method == 'POST':
        #get the required element off JSON 
        question = request.args.get('question', default=0, type=int)
        Round = request.args.get('Round', default=0, type=int)
        Category = request.args.get('Category', default=0, type=int)
    
        #preprocess
        question = PreprocessingText(question)
        
        #concatenate
        concatenate = Round + " " + Category + " " + question
        
        #predict
        return ("Value for the question is",model.predict(concatenate))

In [None]:
"""Use pretrained model ELMo with tensorflow hub"""

path = r"C:\Users\s\Desktop\Wysa\elmo_3"
Module = hub.load(path)
def encoderElmo(words, batch_size = 1000):
    
    ElmoEncoded = []
    start_batch_size = 0# starts from 0
    
    for i in range(len(words) % batch_size ):
        end_batch_size = start_batch_size + batch_size #1000 is batch size, so 1000 textsgo to elmo each time
        
        temp = words[start_batch_size : end_batch_size]
        
        ElmoEncoded.append(Module(np.array(temp)))#each run 1000 gets admitted to elmoencoded list
        
        start_batch_size += 1000
        
    return ElmoEncoded
        

data_copy['concatenate'] = encoderElmo(data_copy['concatenate'])#send the questions in


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

model = keras.Sequential(
    [
        layers.Dense(1024, activation="relu", name="layer1"),#1024 is the output dimension for ELMO
        layers.Dense(512, activation="relu", name="layer2"),
        layers.Dense(150, activation="softmax", name="layer3"),
    ]
)

X_train, X_test, y_train, y_test = train_test_split(data_copy['concatenate'], Y, test_size=0.25)#split the data

model.fit(X_train, X_test)

Prediction = model.predict(X_test)
acc_score = model.score(X_test, y_test)
c_matrix = confusion_matrix(y_test, Prediction)

precision, recall, f1_score, support = precision_recall_fscore_support(y_test, Prediction)