In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
np.random.seed(2021)
import matplotlib.pyplot as plt
import warnings
import pprint
# warnings.simplefilter("ignore")

In [None]:
questions_dataset = pd.read_csv("/kaggle/input/200000-jeopardy-questions/JEOPARDY_CSV.csv")

In [None]:
questions_dataset.head()

Rename the columns: strip leading/trailing whitespace and convert the names to lower case.

In [None]:
# Normalise the header: trim surrounding whitespace and lower-case every column name.
questions_dataset.columns = questions_dataset.columns.str.strip().str.lower()
questions_dataset.head()

In [None]:
questions_dataset['round'].value_counts()

In [None]:
## Visualization Utility Functions
from wordcloud import WordCloud
import matplotlib.colors as mcolors

# Bar chart of a {label: count} mapping (used below for rounds and value bins)
def plot_dist(data, title, xlabel, ylabel):
    """Draw a bar chart with one bar per entry of ``data``.

    Parameters
    ----------
    data : dict
        Mapping whose keys become the x tick labels and whose values
        become the bar heights, in the mapping's iteration order.
    title, xlabel, ylabel : str
        Text for the figure title and the two axis labels.

    The figure is shown immediately via ``plt.show()``; nothing is returned.
    """
    labels = list(data.keys())
    heights = list(data.values())
    positions = np.arange(len(labels))

    plt.figure(figsize=[12, 8])
    plt.bar(positions, heights, color="blue")

    # One tick per bar, labelled with the corresponding key.
    plt.xticks(positions, labels)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)

    plt.show()

Distribution of the questions across the rounds. Since the Final Jeopardy! and Tiebreaker rounds have far fewer questions than the other rounds, and their value is None, they are ignored from here on.

In [None]:
plot_dist(questions_dataset["round"].value_counts().to_dict(),
               'Number of questions in each Round',
               'Rounds',
               'Number of Questions'
              )

In [None]:
# Drop the questions that carry no dollar value.
# Depending on the pandas version, read_csv either keeps the literal string
# "None" or parses it as a missing value (NaN), so both spellings are removed.
# Leaving NaN rows in would make the later string-to-int conversion fail.
missing_value_mask = questions_dataset['value'].isna() | (questions_dataset['value'] == "None")
questions_dataset.drop(questions_dataset[missing_value_mask].index, inplace=True)

+ Convert the entries of the value column from string to int.
+ The numeric values are then rounded to the nearest hundred (values below 1,000), thousand (values below 10,000) or ten thousand (everything else) and grouped into bins, which form the target column for the rest of the problem.

In [None]:
## Utility Functions.

def process_value(value):
    """Turn a dollar string such as ``"$1,200"`` into the integer ``1200``.

    Leading/trailing ``$`` characters and all ``,`` separators are removed
    before the remaining text is parsed with ``int``; text that is not a
    valid integer after that raises ``ValueError``.
    """
    digits = value.strip('$').replace(',', '')
    return int(digits)

def binning(value):
    """Round ``value`` to a coarser bin whose width grows with its magnitude.

    * below 1,000   -> rounded to the nearest 100
    * below 10,000  -> rounded to the nearest 1,000
    * otherwise     -> rounded to the nearest 10,000

    Rounding is delegated to ``np.round`` with a negative ``decimals``.
    """
    if value < 1000:
        decimals = -2
    elif value < 10000:
        decimals = -3
    else:
        decimals = -4
    return np.round(value, decimals)


In [None]:
questions_dataset['num_value'] = questions_dataset['value'].apply(process_value)
questions_dataset.num_value.nunique()

In [None]:
questions_dataset['value_bins'] = questions_dataset['num_value'].apply(binning)
questions_dataset.value_bins.nunique()

In [None]:
questions_dataset.head()

#### Preprocess the Questions column

In [None]:
import re
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stop_words = set(stopwords.words('english'))
porter_stemmer = PorterStemmer()
def preprocess_text(question):
    """Normalise one question string for vectorisation.

    Steps: lower-case the text, replace every character that is not an
    ASCII letter or a space with a space, drop tokens found in the
    module-level ``stop_words`` set, stem the remaining tokens with
    ``porter_stemmer`` and join them back with single spaces.
    """
    letters_only = re.sub("[^a-z A-Z]", ' ', question.lower())
    # Only letters and spaces remain, so a whitespace split yields exactly
    # the non-empty space-separated tokens.
    stems = [
        porter_stemmer.stem(token)
        for token in letters_only.split()
        if token not in stop_words
    ]
    return " ".join(stems)

In [None]:
processed_question = questions_dataset['question'].apply(preprocess_text)

In [None]:
processed_question.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
tf_idf = TfidfVectorizer()

In [None]:
# Build the model inputs: TF-IDF features from the cleaned question text (X)
# and the binned dollar value as the classification target (Y).
# NOTE(review): the vectorizer is fitted on every question here, before the
# train/test split performed later, so its vocabulary and IDF weights also
# see the rows that end up in the test set. Consider fitting on the training
# split only and calling transform() on the test split — TODO confirm.
X = tf_idf.fit_transform(processed_question)
Y = questions_dataset.value_bins

Distribution of the Classes/Bins 

In [None]:
plot_dist(questions_dataset["value_bins"].value_counts().to_dict(),
               'Number of questions in each Value Bin',
               'Value Bins',
               'Number of Questions'
              )

In [None]:
# Per-bin weights for the classifiers, inversely proportional to bin frequency:
#     weight = n_samples / (n_classes * count)
# (the same formula scikit-learn uses for class_weight="balanced").
# Using the raw counts as weights would do the opposite of what is wanted:
# the already dominant bins would get the largest weights.
bin_counts = questions_dataset.value_bins.value_counts()
class_weights = (bin_counts.sum() / (len(bin_counts) * bin_counts)).to_dict()
pprint.pprint(class_weights)

In [None]:
X

In [None]:
Y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y,
                                                   test_size= 0.30,
                                                   random_state= 1,
                                                   stratify= Y)

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# RFC=RandomForestClassifier(max_features="sqrt")
# parameters={ "max_depth":[5,25], 
#              "min_samples_split":[1,5], "n_estimators":[800,1200]}
# from sklearn.model_selection import GridSearchCV
# clf = GridSearchCV(RFC, parameters)

In [None]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state= 1, class_weight= class_weights)

In [None]:
log_reg

In [None]:
y_train.unique()

In [None]:
log_reg.fit(X_train, y_train)

In [None]:
y_pred_logreg = log_reg.predict(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the TF-IDF features.
# random_state is pinned (matching log_reg and train_test_split) so the model
# is reproducible even when cells are re-run out of order, instead of relying
# on the global NumPy seed. n_jobs=-1 trains the 800 trees on all available
# cores; with a fixed random_state this does not change the fitted model.
RFC = RandomForestClassifier(max_depth=25,
                             min_samples_split=8,
                             n_estimators=800,
                             max_features='sqrt',
                             class_weight=class_weights,
                             random_state=1,
                             n_jobs=-1
                            )

In [None]:
RFC

In [None]:
RFC.fit(X_train,y_train)

In [None]:
y_pred_rfc = RFC.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
# cm = confusion_matrix(y_true= y_test, y_pred= y_pred)
lr_accuracy = accuracy_score(y_true= y_test, y_pred= y_pred_logreg)
rf_accuracy = accuracy_score(y_true= y_test, y_pred= y_pred_rfc)
# cls_report = classification_report(y_true= y_test, y_pred= y_pred)

In [None]:
print(lr_accuracy)

In [None]:
print(rf_accuracy)