In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and pandas
# copy warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Plot styling: the matplotlib style sheet is applied first, then the seaborn
# style on top of it, so the order of these two calls matters.
plt.style.use('fivethirtyeight')
sns.set_style('darkgrid')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalMaxPooling1D, LSTM, Bidirectional, Embedding, Dropout

In [None]:
# Load the raw Jeopardy! clues.
df = pd.read_csv("/kaggle/input/200000-jeopardy-questions/JEOPARDY_CSV.csv")

# Keep only clues that carry a dollar value. Rows without one hold the text
# 'None' in the file.
# NOTE(review): newer pandas releases appear to treat 'None' as a missing-value
# token in read_csv, in which case those rows arrive as NaN instead of the
# string — confirm against the installed version. Both forms are excluded so
# the later string-to-int conversion of ' Value' never sees a non-string.
has_value = df[' Value'].notna() & (df[' Value'] != 'None')

# .copy() detaches the filtered frame from the original so that the column
# assignments in later cells do not write into a slice.
df = df[has_value].copy()

In [None]:
# Preview the first three and the last three rows of the frame.
display(df.head(3), df.tail(3))

In [None]:
# Quick size / schema / missing-value overview of the frame.
n_rows, n_cols = df.shape
print(f"Dataset has {n_rows} rows \nAnd {n_cols} columns")
print("\nFeatures :\n", list(df.columns))
print("\nMissing Values :", df.isna().to_numpy().sum())

In [None]:
# Print the DataFrame.info() summary of the frame.
df.info()

In [None]:
# Missing-value count for each column.
df.isna().sum()

## Answer

In [None]:
# Occurrence count of each distinct answer, shown as a one-column table.
(
    df[' Answer']
    .value_counts()
    .to_frame()
)

In [None]:
# Replace missing answers with the most frequent answer in the column.
most_common_answer = df[' Answer'].mode()[0]
df[' Answer'] = df[' Answer'].fillna(most_common_answer)

## Air Date

In [None]:
# Convert the air-date column to pandas datetimes.
air_dates = pd.to_datetime(df[' Air Date'])
df[' Air Date'] = air_dates

## Round

In [None]:
# Occurrence count of each round, shown as a one-column table.
(
    df[' Round']
    .value_counts()
    .to_frame()
)

## Category

In [None]:
# Occurrence count of each category, shown as a one-column table.
(
    df[' Category']
    .value_counts()
    .to_frame()
)

## Answer (after filling missing values)

In [None]:
# Occurrence count of each distinct answer, shown as a one-column table.
(
    df[' Answer']
    .value_counts()
    .to_frame()
)

## Value

In [None]:
def _dollars_to_int(raw):
    """Strip the '$' sign and ',' separators from a value string and return it as an int."""
    digits = raw.replace('$', '').replace(',', '')
    return int(digits)


# Numeric version of the ' Value' column.
df['ValueNum'] = df[' Value'].map(_dollars_to_int)

In [None]:
def binning(value):
    """Round a dollar value to a coarser bin.

    Values below 1000 are rounded to the nearest hundred, values below 10000
    to the nearest thousand, and everything else to the nearest ten thousand.

    Parameters
    ----------
    value : number
        The numeric clue value.

    Returns
    -------
    The result of ``np.round`` applied with the selected negative ``decimals``.
    """
    # Pick the rounding precision first, then round once.
    if value < 1000:
        decimals = -2
    elif value < 10000:
        decimals = -3
    else:
        decimals = -4
    return np.round(value, decimals)

# Bin every numeric value; these bins are the classification targets used below.
df['ValueBins'] = df['ValueNum'].map(binning)

In [None]:
# Compare the number of distinct raw values with the number of bins.
raw_values = df[' Value'].unique()
binned_values = df['ValueBins'].unique()

print("Total number of categories:", raw_values.shape[0])
print("Number of categories after binning:", binned_values.shape[0])
print("\nBinned Categories:", binned_values)

In [None]:
# Split on unique show numbers rather than on rows, so every clue of a given
# show ends up on the same side of the train/test boundary.
show_numbers = df['Show Number'].unique()
train_shows, test_shows = train_test_split(show_numbers, test_size=0.2, random_state=2019)

train_mask = df['Show Number'].isin(train_shows)
test_mask = df['Show Number'].isin(test_shows)

# Select the rows of each split once, then pull out questions and labels.
train_rows = df.loc[train_mask]
test_rows = df.loc[test_mask]

train_questions, train_labels = train_rows[' Question'], train_rows['ValueBins']
test_questions, test_labels = test_rows[' Question'], test_rows['ValueBins']

## Simple Linear Model

In [None]:
%%time
bow = CountVectorizer(stop_words='english', max_features=2000)
bow.fit(df[' Question'])

In [None]:
# Bag-of-words features and binned-value targets for the linear model.
X_train, X_test = bow.transform(train_questions), bow.transform(test_questions)
y_train, y_test = train_labels, test_labels

shapes = {
    "X_train": X_train.shape,
    "X_test": X_test.shape,
    "y_train": y_train.shape,
    "y_test": y_test.shape,
}
for label, shape in shapes.items():
    print(f"Shape of {label}:", shape)

###  Logistic Regression

In [None]:
%%time
lr = LogisticRegression(solver='saga', multi_class='multinomial', max_iter=200)
lr.fit(X_train, y_train)

In [None]:
# Score the linear model on the held-out shows.
y_pred = lr.predict(X_test)
lr_report = classification_report(y_test, y_pred)

print(lr_report)

## Tokenization

In [None]:
# Build the word index from the training questions only, so the vocabulary is
# not influenced by the held-out shows.
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(train_questions)

# Map each question to its sequence of word indices.
train_sequence = tokenizer.texts_to_sequences(train_questions)
test_sequence = tokenizer.texts_to_sequences(test_questions)

# train_questions keeps the row labels of the original frame, so label 0 is
# not guaranteed to be in the training split; use positional access so the
# text shown is the one that produced train_sequence[0].
print("Original text:", train_questions.iloc[0])
print("Converted sequence:", train_sequence[0])

In [None]:
# Turn the index sequences into fixed-width arrays (maxlen=50) for the network.
X_train, X_test = (
    pad_sequences(sequences, maxlen=50)
    for sequences in (train_sequence, test_sequence)
)

for padded in (X_train, X_test):
    print(padded.shape)

In [None]:
# Encode the value bins as integer class ids. The encoder is fitted on the
# full label column so every bin present in either split has an id.
le = LabelEncoder().fit(df['ValueBins'])

y_train, y_test = le.transform(train_labels), le.transform(test_labels)

for encoded in (y_train, y_test):
    print(encoded.shape)

In [None]:
# Sizes needed by the network: embedding vocabulary size and number of classes.
num_words, output_size = tokenizer.num_words, len(le.classes_)

In [None]:
# Text classifier: embedding -> bidirectional LSTM -> max over time -> dense head.
model = Sequential()
model.add(
    Embedding(
        input_dim=num_words,
        output_dim=200,
        mask_zero=True,
        input_length=50,
    )
)
model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(GlobalMaxPooling1D())
model.add(Dense(300, activation='relu'))
model.add(Dropout(0.5))
# One softmax unit per encoded value bin.
model.add(Dense(output_size, activation='softmax'))

# Integer class ids are used as targets, hence the sparse loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Train the network; validation_split=0.1 reserves a tenth of the training
# samples for validation.
model.fit(
    X_train,
    y_train,
    batch_size=1024,
    epochs=10,
    validation_split=0.1,
)

In [None]:
# Score the network on the held-out shows: take the highest-probability class
# per question and compare it with the encoded labels.
class_probabilities = model.predict(X_test, batch_size=1024)
y_pred = class_probabilities.argmax(axis=1)

nn_report = classification_report(y_test, y_pred)
print(nn_report)