In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# imports
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS 
import matplotlib.pyplot as plt

### Exploratory Data Analysis

In [None]:
df = pd.read_csv("/kaggle/input/200000-jeopardy-questions/JEOPARDY_CSV.csv")

In [None]:
df.head()

In [None]:
df.info()

Checking the unique values in the Round column: there are four of them — Jeopardy!, Final Jeopardy!, Double Jeopardy! and Tiebreaker.

In [None]:
df[' Round'].unique()

Let's look at a count plot of the round types. The majority of the samples come from Jeopardy! or Double Jeopardy!, very few from Final Jeopardy!, and a negligible number from Tiebreaker.

In [None]:
sns.countplot(x=' Round',data=df)

The samples from category Jeopardy are 107384

In [None]:
df[df[' Round'] == 'Jeopardy!'].count()

The samples from category Double Jeopardy are 105912

In [None]:
df[df[' Round'] == 'Double Jeopardy!'].count()

The samples from category Final Jeopardy are 3631

In [None]:
df[df[' Round'] == 'Final Jeopardy!'].count()

The samples from category Tiebreaker are only 3

In [None]:
df[df[' Round'] == 'Tiebreaker'].count()

There are 27995 unique categories in the 'Category' column.

In [None]:
df[' Category'].nunique()

checking if there are any null values

In [None]:
df.isnull().sum()

There are no null values but some values in the 'Value' column are filled with the string 'None'.

In [None]:
df[df[' Value'] == 'None']

Dropping rows containing 'None' Values.

In [None]:
# Remove, in place, every row whose ' Value' entry is the literal string 'None'.
rows_without_value = df.index[df[' Value'] == 'None']
df.drop(index=rows_without_value, inplace=True)

The 'Value' column stores each value as a string that also contains a $ sign and commas, so we remove those characters and convert the string to an integer in a new column, 'ValueNum'.

In [None]:
def _dollars_to_int(text):
    """Turn a price string such as '$1,200' into the integer 1200."""
    digits_only = text.replace('$', '').replace(',', '')
    return int(digits_only)


# Convert every ' Value' string to its integer amount.
df['ValueNum'] = df[' Value'].map(_dollars_to_int)

In [None]:
df['ValueNum'].head()

There are 145 unique values in the ValueNum column, which is a lot of distinct classes to predict.

In [None]:
df['ValueNum'].nunique()

Binning the values: if a value is smaller than 1,000, we round it to the nearest hundred; if it is at least 1,000 but smaller than 10,000, we round it to the nearest thousand; and if it is 10,000 or greater, we round it to the nearest ten thousand.

In [None]:
def binning(value):
    """Round a dollar amount to a coarser bin whose width grows with the amount.

    Amounts below 1000 are rounded to the nearest hundred, amounts below
    10000 to the nearest thousand, and everything else to the nearest ten
    thousand.

    Parameters
    ----------
    value : number
        The amount to bin.

    Returns
    -------
    The result of ``np.round`` applied to ``value`` at the chosen precision.
    """
    # A negative `decimals` tells np.round to round to the left of the
    # decimal point: -2 -> hundreds, -3 -> thousands, -4 -> ten-thousands.
    if value < 1000:
        decimals = -2
    elif value < 10000:
        decimals = -3
    else:
        decimals = -4
    return np.round(value, decimals)

df['ValueBins'] = df['ValueNum'].apply(binning)

So now we have 21 different values to classify instead of 145.

In [None]:
df['ValueBins'].nunique()

Let's take a look at the 'Question' column.

In [None]:
df[' Question']

In [None]:
# Build one lowercase text corpus from every question for the word cloud.
stopwords = set(STOPWORDS)

# Each question is split on whitespace, lowercased token by token, re-joined
# with single spaces and followed by one trailing space. All pieces are joined
# in a single pass instead of growing a string with `+=` once per row.
comment_words = "".join(
    " ".join(token.lower() for token in str(val).split()) + " "
    for val in df[' Question']
)

# Render the corpus as an 800x800 word cloud on a white background, ignoring
# the words in the STOPWORDS set imported from wordcloud.
wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      stopwords=stopwords,
                      min_font_size=10).generate(comment_words)

plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

### Building a Random Forest Model
The dataset is large, so for convenience let's take 10,000 random samples from the dataframe.

In [None]:
# Draw a fixed-seed random subset of 10,000 rows so that a fresh
# "Restart & Run All" trains and evaluates on the same rows every time.
df_sample = df.sample(n=10000, random_state=1)

In [None]:
X = df_sample[' Question']
y = df_sample['ValueBins']

We will use a Random Forest Classifier model with Grid Searching for finding the best hyperparameters from our dictionary of parameters

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split

# Base estimator; max_features="sqrt" limits the features considered at each split.
RFC = RandomForestClassifier(max_features="sqrt")

# Hyperparameter grid that GridSearchCV evaluates exhaustively.
# min_samples_split must be an integer >= 2 (or a fraction in (0, 1]); the
# value 1 that used to be in this list is outside that range, so candidates
# using it could not be fitted.
parameters = {
    "max_depth": [5, 8, 25],
    "min_samples_split": [2, 5],
    "n_estimators": [800, 1200],
}

clf = GridSearchCV(RFC, parameters)


In [None]:
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
# Fit the TF-IDF vectorizer on the sampled questions and replace X with the
# resulting feature matrix.
# NOTE(review): X is rebound here, so re-running this cell without first
# re-running the cell that assigns the raw question text to X passes the
# matrix itself back into fit_transform.
# NOTE(review): the vectorizer is fitted on all sampled rows before
# train_test_split is called, so the held-out rows also influence the
# vocabulary and IDF weights — consider fitting on the training split only.
X = tfidf.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,random_state=1)

In [None]:
clf.fit(X_train,y_train)

In [None]:
print(clf.cv_results_['params'])

In [None]:
print(clf.cv_results_['rank_test_score'])

We can see that these are the best parameters from the parameter dict

In [None]:
# Ask the fitted search for its top-ranked parameter combination directly;
# a fixed position in cv_results_['params'] is only correct when that entry
# happens to be ranked first in this particular run.
print(clf.best_params_)

So this is how we get the best parameters and we can use these parameters to train over the complete data to get the best results!