## Linear SVR and SVR

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [2]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
from textblob import TextBlob
from wordcloud import WordCloud

In [90]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.feature_extraction import text
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.preprocessing import StandardScaler

In [5]:
df = pd.read_csv("combined_season1-37.tsv.zip", delimiter='\t')

In [6]:
df

Unnamed: 0,round,value,daily_double,category,comments,answer,question,air_date,notes
0,1,100,no,LAKES & RIVERS,-,River mentioned most often in the Bible,the Jordan,1984-09-10,-
1,1,200,no,LAKES & RIVERS,-,Scottish word for lake,loch,1984-09-10,-
2,1,800,yes,LAKES & RIVERS,-,River in this famous song:,the Volga River,1984-09-10,-
3,1,400,no,LAKES & RIVERS,-,American river only 33 miles shorter than the ...,the Missouri,1984-09-10,-
4,1,500,no,LAKES & RIVERS,-,"World's largest lake, nearly 5 times as big as...",the Caspian Sea,1984-09-10,-
...,...,...,...,...,...,...,...,...,...
389440,2,400,no,FOUNDRY,-,This hefty noisemaker from Whitechapel Foundry...,Big Ben,2021-08-13,-
389441,2,800,no,FOUNDRY,-,"Around 4,000 years ago, the first foundries in...",bronze,2021-08-13,-
389442,2,1200,no,FOUNDRY,-,Several different foundries worked for 4 month...,Monitor,2021-08-13,-
389443,2,1600,no,FOUNDRY,-,"Once one of the largest of its kind, the Gary ...",U.S. Steel,2021-08-13,-


In [7]:
df['air_date'] = pd.to_datetime(df['air_date'])

In [8]:
#On 11/26/2001, the values for the questions doubled for both rounds of Jeopardy. Need to adjust the earlier episodes to have the same values as post-11/26/2001 shows.
df.loc[df['air_date'] < '2001-11-26', "value"] = df.value * 2

In [9]:
#remove Daily Doubles since the contestants can wager any amounts for those
df = df[df["daily_double"] != 'yes']

In [10]:
#keep only standard values (this will remove Final Jeopardy questions, which do not have a set amount and set are at '0', as well as the handful of non-standard values that are likely typos)
df = df.loc[df['value'].isin([200, 400, 600, 800, 1000, 400, 800, 1200, 1600, 2000])]

## NLP Data Cleaning

In [11]:
# import string to remove punctuation

import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [19]:
def no_punctuation(text):
    nopunct=[words for words in text if words not in string.punctuation]
    words_without_punct=''.join(nopunct)
    return words_without_punct

In [13]:
#remove punctuation and lowercase words in 'category,' 'answer,' and 'question'

df['category'] = df['category'].apply(lambda x: no_punctuation(x).lower())
df['answer'] = df['answer'].apply(lambda x: no_punctuation(x).lower())
df['question'] = df['question'].apply(lambda x: no_punctuation(x).lower())

In [14]:
#remove numerals from 'category,' 'answer,' and 'question'

df['category'] = df['category'].str.replace('\d+', '')
df['answer'] = df['answer'].str.replace('\d+', '')
df['question'] = df['question'].str.replace('\d+', '')

In [15]:
df

Unnamed: 0,round,value,daily_double,category,comments,answer,question,air_date,notes
0,1,200,no,lakes rivers,-,river mentioned most often in the bible,the jordan,1984-09-10,-
1,1,400,no,lakes rivers,-,scottish word for lake,loch,1984-09-10,-
3,1,800,no,lakes rivers,-,american river only miles shorter than the mi...,the missouri,1984-09-10,-
4,1,1000,no,lakes rivers,-,worlds largest lake nearly times as big as su...,the caspian sea,1984-09-10,-
5,1,200,no,inventions,-,marconis wonderful wireless,the radio,1984-09-10,-
...,...,...,...,...,...,...,...,...,...
389438,2,1600,no,lost,-,in a moveable feast gertrude stein is quoted a...,lost generation,2021-08-13,-
389440,2,400,no,foundry,-,this hefty noisemaker from whitechapel foundry...,big ben,2021-08-13,-
389441,2,800,no,foundry,-,around years ago the first foundries in mesop...,bronze,2021-08-13,-
389442,2,1200,no,foundry,-,several different foundries worked for months...,monitor,2021-08-13,-


## Linear SVR

In [16]:
#Use 'answer' (Jeopardy questions) as the feature and 'value' as the target

X = df['answer']
y = df['value']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.25, random_state=54)

In [18]:
#Instantiate the vector with stop words, max_features=1000, and bigrams
tfidf = TfidfVectorizer(stop_words='english', ngram_range= (1, 2), max_features=1000)

# Fit the vectorizer on X_train and transform it
X_train_vectorized = tfidf.fit_transform(X_train)

In [20]:
print("Shape of X_train_vectorized:", X_train_vectorized.shape)

Shape of X_train_vectorized: (272823, 1000)


In [23]:
#Basic Linear SVR

linear_SVR_regressor = LinearSVR()

linear_SVR_cv = cross_val_score(linear_SVR_regressor, X_train_vectorized, y_train, scoring='neg_root_mean_squared_error', cv=2)
linear_SVR_cv

array([-538.015546  , -538.15476162])

In [24]:
print("Basic Linear SVR:", -(linear_SVR_cv.mean()))

Basic Linear SVR: 538.0851538096358


In [29]:
#Tweaking Linear SVR

linear_SVR_regressor_l2loss = LinearSVR(loss='squared_epsilon_insensitive')

linear_SVR_cv_l2loss = cross_val_score(linear_SVR_regressor_l2loss, X_train_vectorized, y_train, scoring='neg_root_mean_squared_error', cv=2)
linear_SVR_cv_l2loss

array([-527.24034472, -527.60050114])

In [31]:
print("Basic Linear SVR:", -(linear_SVR_cv.mean()))
print("Linear SVR L2 Loss:", -(linear_SVR_cv_l2loss.mean()))

Basic Linear SVR: 538.0851538096358
Linear SVR L2 Loss: 527.4204229282802


In [61]:
#Tweaking Linear SVR

linear_SVR_regressor_tol = LinearSVR(loss='squared_epsilon_insensitive', tol = 1e-8)

linear_SVR_cv_tol = cross_val_score(linear_SVR_regressor_tol, X_train_vectorized, y_train, scoring='neg_root_mean_squared_error', cv=2)
linear_SVR_cv_tol

array([-527.24054804, -527.60054926])

In [62]:
print("Basic Linear SVR:", -(linear_SVR_cv.mean()))
print("Linear SVR L2 Loss:", -(linear_SVR_cv_l2loss.mean()))
print("Linear SVR Tolerance:", -(linear_SVR_cv_tol.mean()))

Basic Linear SVR: 538.0851538096358
Linear SVR L2 Loss: 527.4204229282802
Linear SVR Tolerance: 527.4205486470335


In [72]:
#Tweaking Linear SVR

linear_SVR_regressor_c = LinearSVR(loss='squared_epsilon_insensitive', C=.1)

linear_SVR_cv_c = cross_val_score(linear_SVR_regressor_c, X_train_vectorized, y_train, scoring='neg_root_mean_squared_error', cv=2)
linear_SVR_cv_c

array([-527.06857972, -527.45422942])

In [134]:
print("Basic Linear SVR:", -(linear_SVR_cv.mean()))
print("Linear SVR L2 Loss:", -(linear_SVR_cv_l2loss.mean()))
print("Linear SVR Tolerance:", -(linear_SVR_cv_tol.mean()))
print("Linear SVR C:", -(linear_SVR_cv_c.mean()))

Basic Linear SVR: 538.0851538096358
Linear SVR L2 Loss: 527.4204229282802
Linear SVR Tolerance: 527.4205486470335
Linear SVR C: 527.2614045728969


In [140]:
#Tweaking Linear SVR

linear_SVR_regressor_verbose = LinearSVR(loss='squared_epsilon_insensitive', C=.1, verbose = 10)

linear_SVR_cv_verbose = cross_val_score(linear_SVR_regressor_verbose, X_train_vectorized, y_train, scoring='neg_root_mean_squared_error', cv=5)
linear_SVR_cv_verbose

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

array([-526.16061581, -527.81008967, -524.38984161, -528.41854293,
       -527.20302308])

In [141]:
print("Basic Linear SVR:", -(linear_SVR_cv.mean()))
print("Linear SVR L2 Loss:", -(linear_SVR_cv_l2loss.mean()))
print("Linear SVR Tolerance:", -(linear_SVR_cv_tol.mean()))
print("Linear SVR Tolerance:", -(linear_SVR_cv_c.mean()))
print("Linear SVR Verbose:", -(linear_SVR_cv_verbose.mean()))

Basic Linear SVR: 538.0851538096358
Linear SVR L2 Loss: 527.4204229282802
Linear SVR Tolerance: 527.4205486470335
Linear SVR Tolerance: 527.2614045728969
Linear SVR Verbose: 526.7964226200212


#### Run test data with tuned linear SVC

In [142]:
# Cross validate best tuned Linear SVC

# Train the model using the training sets
linear_SVR_regressor_verbose.fit(X_train_vectorized, y_train)

[LibLinear]

LinearSVR(C=0.1, loss='squared_epsilon_insensitive', verbose=10)

In [143]:
#Vectorize X_test
X_test_vectorized = tfidf.fit_transform(X_test)

In [144]:
# Make predictions using the testing set
y_pred = linear_SVR_regressor_verbose.predict(X_test_vectorized)

In [145]:
rmse_best_linear_svc = mean_squared_error(y_test, y_pred, squared=False)

In [146]:
rmse_best_linear_svc

535.7827738378692

### SVR

In [83]:
#Basic SVR

SVR_regressor = SVR(max_iter=100)

SVR_cv = cross_val_score(SVR_regressor, X_train_vectorized, y_train, scoring='neg_root_mean_squared_error', cv=2)
SVR_cv



array([-578.34792741, -579.24311364])

In [84]:
print("Basic SVR:", -(SVR_cv.mean()))

Basic SVR: 578.7955205264072


In [101]:
#Tweaking SVR

SVR_regressor_max = SVR(max_iter=5000)

SVR_cv_max = cross_val_score(SVR_regressor_max, X_train_vectorized, y_train, scoring='neg_root_mean_squared_error', cv=2)
SVR_cv_max



array([-576.35614576, -577.24640192])

In [120]:
print("Basic SVR:", -(SVR_cv.mean()))
print("SVR increased max_iter:", -(SVR_cv_max.mean()))

Basic SVR: 578.7955205264072
SVR increased max_iter: 576.8012738368016


In [104]:
#Tweaking SVR

SVR_regressor_linearkernel = SVR(max_iter=5000, kernel='linear')

SVR_cv_linearkernel = cross_val_score(SVR_regressor_linearkernel, X_train_vectorized, y_train, scoring='neg_root_mean_squared_error', cv=2)
SVR_cv_linearkernel



array([-577.6139814 , -578.40234812])

In [119]:
print("Basic SVR:", -(SVR_cv.mean()))
print("SVR increased max_iter:", -(SVR_cv_max.mean()))
print("SVR linear kernel:", -(SVR_cv_linearkernel.mean()))

Basic SVR: 578.7955205264072
SVR increased max_iter: 576.8012738368016
SVR linear kernel: 578.0081647598281


In [107]:
#Tweaking SVR

SVR_regressor_polykernel = SVR(max_iter=5000, kernel='poly')

SVR_cv_polykernel = cross_val_score(SVR_regressor_polykernel, X_train_vectorized, y_train, scoring='neg_root_mean_squared_error', cv=2)
SVR_cv_polykernel



array([-577.95419264, -578.85632656])

In [118]:
print("Basic SVR:", -(SVR_cv.mean()))
print("SVR increased max_iter:", -(SVR_cv_max.mean()))
print("SVR linear kernel:", -(SVR_cv_linearkernel.mean()))
print("SVR poly kernel:", -(SVR_cv_linearkernel.mean()))

Basic SVR: 578.7955205264072
SVR increased max_iter: 576.8012738368016
SVR linear kernel: 578.0081647598281
SVR poly kernel: 578.0081647598281


In [109]:
#Tweaking SVR

SVR_regressor_sigmoidkernel = SVR(max_iter=5000, kernel='poly')

SVR_cv_sigmoidkernel = cross_val_score(SVR_regressor_sigmoidkernel, X_train_vectorized, y_train, scoring='neg_root_mean_squared_error', cv=2)
SVR_cv_sigmoidkernel



array([-577.95419264, -578.85632656])

In [117]:
print("Basic SVR:", -(SVR_cv.mean()))
print("SVR increased max_iter:", -(SVR_cv_max.mean()))
print("SVR linear kernel:", -(SVR_cv_linearkernel.mean()))
print("SVR poly kernel:", -(SVR_cv_linearkernel.mean()))
print("SVR sigmoid kernel:", -(SVR_cv_sigmoidkernel.mean()))

Basic SVR: 578.7955205264072
SVR increased max_iter: 576.8012738368016
SVR linear kernel: 578.0081647598281
SVR poly kernel: 578.0081647598281
SVR sigmoid kernel: 578.4052595989834


will stick with the default kernel (rbf)

In [113]:
#Tweaking SVR

SVR_regressor_gamma = SVR(max_iter=5000, gamma='auto')

SVR_cv_gamma = cross_val_score(SVR_regressor_gamma, X_train_vectorized, y_train, scoring='neg_root_mean_squared_error', cv=2)
SVR_cv_gamma



array([-578.3286645 , -579.21267189])

In [116]:
print("Basic SVR:", -(SVR_cv.mean()))
print("SVR increased max_iter:", -(SVR_cv_max.mean()))
print("SVR linear kernel:", -(SVR_cv_linearkernel.mean()))
print("SVR poly kernel:", -(SVR_cv_linearkernel.mean()))
print("SVR sigmoid kernel:", -(SVR_cv_sigmoidkernel.mean()))
print("SVR gamma:", -(SVR_cv_gamma.mean()))

Basic SVR: 578.7955205264072
SVR increased max_iter: 576.8012738368016
SVR linear kernel: 578.0081647598281
SVR poly kernel: 578.0081647598281
SVR sigmoid kernel: 578.4052595989834
SVR gamma: 578.7706681965676


will stick with the default gamma (scale)

In [125]:
#Tweaking SVR

SVR_regressor_tol = SVR(max_iter=5000, tol=1e-8)

SVR_cv_tol = cross_val_score(SVR_regressor_tol, X_train_vectorized, y_train, scoring='neg_root_mean_squared_error', cv=2)
SVR_cv_tol



array([-576.35614576, -577.24640192])

In [126]:
print("Basic SVR:", -(SVR_cv.mean()))
print("SVR increased max_iter:", -(SVR_cv_max.mean()))
print("SVR linear kernel:", -(SVR_cv_linearkernel.mean()))
print("SVR poly kernel:", -(SVR_cv_linearkernel.mean()))
print("SVR sigmoid kernel:", -(SVR_cv_sigmoidkernel.mean()))
print("SVR gamma:", -(SVR_cv_gamma.mean()))
print("SVR tolerance for stopping criterion:", -(SVR_cv_gamma.mean()))

Basic SVR: 578.7955205264072
SVR increased max_iter: 576.8012738368016
SVR linear kernel: 578.0081647598281
SVR poly kernel: 578.0081647598281
SVR sigmoid kernel: 578.4052595989834
SVR gamma: 578.7706681965676
SVR tolerance for stopping criterion: 578.7706681965676


will keep default tolerance (0.001)

In [132]:
#Tweaking SVR

SVR_regressor_c = SVR(max_iter=5000, C=0.8)

SVR_cv_c = cross_val_score(SVR_regressor_c, X_train_vectorized, y_train, scoring='neg_root_mean_squared_error', cv=2)
SVR_cv_c



array([-576.6325775 , -577.63279238])

In [133]:
print("Basic SVR:", -(SVR_cv.mean()))
print("SVR increased max_iter:", -(SVR_cv_max.mean()))
print("SVR linear kernel:", -(SVR_cv_linearkernel.mean()))
print("SVR poly kernel:", -(SVR_cv_linearkernel.mean()))
print("SVR sigmoid kernel:", -(SVR_cv_sigmoidkernel.mean()))
print("SVR gamma:", -(SVR_cv_gamma.mean()))
print("SVR tolerance for stopping criterion:", -(SVR_cv_gamma.mean()))
print("SVR smaller C:", -(SVR_cv_c.mean()))

Basic SVR: 578.7955205264072
SVR increased max_iter: 576.8012738368016
SVR linear kernel: 578.0081647598281
SVR poly kernel: 578.0081647598281
SVR sigmoid kernel: 578.4052595989834
SVR gamma: 578.7706681965676
SVR tolerance for stopping criterion: 578.7706681965676
SVR smaller C: 577.1326849432296


In [147]:
#Cross Validate best tuned SVR

SVR_regressor_max = SVR(max_iter=10000)

SVR_regressor_cv_max = cross_val_score(SVR_regressor_max, X_train_vectorized, y_train, scoring='neg_root_mean_squared_error', cv=5)
SVR_regressor_cv_max



array([-575.27553439, -576.48081035, -576.15486822, -577.0988126 ,
       -575.99102385])

In [149]:
print("SVR increased max_iter:", -(SVR_regressor_cv_max.mean()))

SVR increased max_iter: 576.2002098825078
