## Random Forest

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [2]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
from textblob import TextBlob
from wordcloud import WordCloud

In [4]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.feature_extraction import text
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [5]:
df = pd.read_csv("combined_season1-37.tsv.zip", delimiter='\t')

In [6]:
df

Unnamed: 0,round,value,daily_double,category,comments,answer,question,air_date,notes
0,1,100,no,LAKES & RIVERS,-,River mentioned most often in the Bible,the Jordan,1984-09-10,-
1,1,200,no,LAKES & RIVERS,-,Scottish word for lake,loch,1984-09-10,-
2,1,800,yes,LAKES & RIVERS,-,River in this famous song:,the Volga River,1984-09-10,-
3,1,400,no,LAKES & RIVERS,-,American river only 33 miles shorter than the ...,the Missouri,1984-09-10,-
4,1,500,no,LAKES & RIVERS,-,"World's largest lake, nearly 5 times as big as...",the Caspian Sea,1984-09-10,-
...,...,...,...,...,...,...,...,...,...
389440,2,400,no,FOUNDRY,-,This hefty noisemaker from Whitechapel Foundry...,Big Ben,2021-08-13,-
389441,2,800,no,FOUNDRY,-,"Around 4,000 years ago, the first foundries in...",bronze,2021-08-13,-
389442,2,1200,no,FOUNDRY,-,Several different foundries worked for 4 month...,Monitor,2021-08-13,-
389443,2,1600,no,FOUNDRY,-,"Once one of the largest of its kind, the Gary ...",U.S. Steel,2021-08-13,-


In [7]:
df['air_date'] = pd.to_datetime(df['air_date'])

In [8]:
#On 11/26/2001, the values for the questions doubled for both rounds of Jeopardy. Need to adjust the earlier episodes to have the same values as post-11/26/2001 shows.
df.loc[df['air_date'] < '2001-11-26', "value"] = df.value * 2

In [9]:
#remove Daily Doubles since the contestants can wager any amounts for those
df = df[df["daily_double"] != 'yes']

In [10]:
#keep only standard values (this will remove Final Jeopardy questions, which do not have a set amount and set are at '0', as well as the handful of non-standard values that are likely typos)
df = df.loc[df['value'].isin([200, 400, 600, 800, 1000, 400, 800, 1200, 1600, 2000])]

## NLP Data Cleaning

In [11]:
# import string to remove punctuation

import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
def no_punctuation(text):
    nopunct=[words for words in text if words not in string.punctuation]
    words_without_punct=''.join(nopunct)
    return words_without_punct

In [13]:
#remove punctuation and lowercase words in 'category,' 'answer,' and 'question'

df['category'] = df['category'].apply(lambda x: no_punctuation(x).lower())
df['answer'] = df['answer'].apply(lambda x: no_punctuation(x).lower())
df['question'] = df['question'].apply(lambda x: no_punctuation(x).lower())

In [14]:
#remove numerals from 'category,' 'answer,' and 'question'

df['category'] = df['category'].str.replace('\d+', '')
df['answer'] = df['answer'].str.replace('\d+', '')
df['question'] = df['question'].str.replace('\d+', '')

In [15]:
df

Unnamed: 0,round,value,daily_double,category,comments,answer,question,air_date,notes
0,1,200,no,lakes rivers,-,river mentioned most often in the bible,the jordan,1984-09-10,-
1,1,400,no,lakes rivers,-,scottish word for lake,loch,1984-09-10,-
3,1,800,no,lakes rivers,-,american river only miles shorter than the mi...,the missouri,1984-09-10,-
4,1,1000,no,lakes rivers,-,worlds largest lake nearly times as big as su...,the caspian sea,1984-09-10,-
5,1,200,no,inventions,-,marconis wonderful wireless,the radio,1984-09-10,-
...,...,...,...,...,...,...,...,...,...
389438,2,1600,no,lost,-,in a moveable feast gertrude stein is quoted a...,lost generation,2021-08-13,-
389440,2,400,no,foundry,-,this hefty noisemaker from whitechapel foundry...,big ben,2021-08-13,-
389441,2,800,no,foundry,-,around years ago the first foundries in mesop...,bronze,2021-08-13,-
389442,2,1200,no,foundry,-,several different foundries worked for months...,monitor,2021-08-13,-


## Random Forest

In [16]:
#Use 'answer' (Jeopardy questions) as the feature and 'value' as the target

X = df['answer']
y = df['value']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.25, random_state=54)

In [18]:
#Instantiate the vector with stop words, max_features=1000, and bigrams
tfidf = TfidfVectorizer(stop_words='english', ngram_range= (1, 2), max_features=1000)

# Fit the vectorizer on X_train and transform it
X_train_vectorized = tfidf.fit_transform(X_train)

In [19]:
print("Shape of X_train_vectorized:", X_train_vectorized.shape)

Shape of X_train_vectorized: (272823, 1000)


In [34]:
#Basic random forest
random_forest = RandomForestRegressor(max_depth=2)
random_forest_cv = cross_val_score(random_forest, X_train_vectorized, y_train, scoring='neg_root_mean_squared_error', cv=2)
random_forest_cv

array([-532.22380864, -532.59566592])

In [25]:
print("Basic Random Forest:", -(random_forest_cv.mean()))

Basic Random Forest: 532.4167569768802


In [21]:
#Use Grid Search to find best parameters for random forest
random_forest_model = RandomForestRegressor()

rf_grid_param1 = {'max_depth': [5, 10, 15]}

rf_grid1 = GridSearchCV(random_forest_model, rf_grid_param1, cv=2)

rf_grid1.fit(X_train_vectorized, y_train)

GridSearchCV(cv=2, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [5, 10, 15]})

In [22]:
rf_grid1.best_params_

{'max_depth': 15}

In [23]:
#Use Grid Search to find best parameters for random forest
random_forest_model = RandomForestRegressor(max_depth=15)

rf_grid_param2 = {'min_samples_split': [2, 4, 6]}

rf_grid2 = GridSearchCV(random_forest_model, rf_grid_param2, cv=2)

rf_grid2.fit(X_train_vectorized, y_train)

GridSearchCV(cv=2, estimator=RandomForestRegressor(max_depth=15),
             param_grid={'min_samples_split': [2, 4, 6]})

In [24]:
rf_grid2.best_params_

{'min_samples_split': 4}

In [20]:
#Use Grid Search to find best parameters for random forest
random_forest_model = RandomForestRegressor(max_depth=15, min_samples_split=4)

rf_grid_param3 = {'min_samples_leaf': [1,2,4]}

rf_grid3 = GridSearchCV(random_forest_model, rf_grid_param3, cv=2)

rf_grid3.fit(X_train_vectorized, y_train)

GridSearchCV(cv=2,
             estimator=RandomForestRegressor(max_depth=15, min_samples_split=4),
             param_grid={'min_samples_leaf': [1, 2, 4]})

In [21]:
rf_grid3.best_params_

{'min_samples_leaf': 1}

In [22]:
#Use Grid Search to find best parameters for random forest
random_forest_model = RandomForestRegressor(max_depth=15, min_samples_split=4, min_samples_leaf=1)

rf_grid_param4 = {'max_features': ["auto", "sqrt", "log2"]}

rf_grid4 = GridSearchCV(random_forest_model, rf_grid_param4, cv=2)

rf_grid4.fit(X_train_vectorized, y_train)

GridSearchCV(cv=2,
             estimator=RandomForestRegressor(max_depth=15, min_samples_split=4),
             param_grid={'max_features': ['auto', 'sqrt', 'log2']})

In [23]:
rf_grid4.best_params_

{'max_features': 'auto'}

In [24]:
#Use Grid Search to find best parameters for random forest
random_forest_model = RandomForestRegressor(max_depth=15, min_samples_split=4, min_samples_leaf=1, max_features='auto')

rf_grid_param5 = {'bootstrap': [True, False]}

rf_grid5 = GridSearchCV(random_forest_model, rf_grid_param5, cv=2)

rf_grid5.fit(X_train_vectorized, y_train)

GridSearchCV(cv=2,
             estimator=RandomForestRegressor(max_depth=15, min_samples_split=4),
             param_grid={'bootstrap': [True, False]})

In [25]:
rf_grid5.best_params_

{'bootstrap': True}

In [26]:
#Use Grid Search to find best parameters for random forest
random_forest_model = RandomForestRegressor(max_depth=15, min_samples_split=4, min_samples_leaf=1, max_features='auto', bootstrap=True)

rf_grid_param6 = {'n_estimators': [5,20,50,100]}

rf_grid6 = GridSearchCV(random_forest_model, rf_grid_param6, cv=2)

rf_grid6.fit(X_train_vectorized, y_train)

GridSearchCV(cv=2,
             estimator=RandomForestRegressor(max_depth=15, min_samples_split=4),
             param_grid={'n_estimators': [5, 20, 50, 100]})

In [27]:
rf_grid6.best_params_

{'n_estimators': 100}

In [28]:
#Use Grid Search to find best parameters for random forest
random_forest_model = RandomForestRegressor(max_depth=15, min_samples_split=4, min_samples_leaf=1, max_features='auto', bootstrap=True)

rf_grid_param7 = {'n_estimators': [100, 115, 130]}

rf_grid7 = GridSearchCV(random_forest_model, rf_grid_param7, cv=2)

rf_grid7.fit(X_train_vectorized, y_train)

GridSearchCV(cv=2,
             estimator=RandomForestRegressor(max_depth=15, min_samples_split=4),
             param_grid={'n_estimators': [100, 115, 130]})

In [30]:
rf_grid7.best_params_

{'n_estimators': 100}

#### Running the tuned random forest

In [31]:
#Tuned random forest
tuned_forest = RandomForestRegressor(max_depth=15, n_estimators=100, min_samples_split=4, min_samples_leaf=1, max_features='auto', bootstrap=True)
tuned_forest_cv = cross_val_score(tuned_forest, X_train_vectorized, y_train, scoring='neg_root_mean_squared_error')

In [32]:
tuned_forest_cv

array([-530.49842151, -531.79691278, -528.39640175, -532.35609186,
       -531.01993146])

In [35]:
print("Basic Random Forest:", -(random_forest_cv.mean()))
print("Tuned Random Forest:", -(tuned_forest_cv.mean()))

Basic Random Forest: 532.4097372781623
Tuned Random Forest: 530.8135518739608


#### Run test data with tuned random forest

In [None]:
# Train the model using the training sets
tuned_forest.fit(X_train_vectorized, y_train)

In [41]:
#Vectorize X_test
X_test_vectorized = tfidf.fit_transform(X_test)

In [42]:
# Make predictions using the testing set
y_pred = tuned_forest.predict(X_test_vectorized)

In [43]:
rmse_best_random_forest = mean_squared_error(y_test, y_pred, squared=False)

In [44]:
rmse_best_random_forest

532.8174097114581