In [1]:
# Modeling: Bag of Words Regression Model with sklearn Neural Net Regressor
### *Claps as Target*

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#nlp
import nltk
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re

#modeling
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor


import warnings
warnings.filterwarnings('ignore')

In [3]:
#nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rachelinsler/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
filename = 'data/nlp_nltk_stemmed_preproc.csv'

In [6]:
df = pd.read_csv(filename).drop(columns = 'Unnamed: 0')

In [7]:
X = df['text']
y = df['claps']

In [8]:
#train-test split
X_train, X_test, y_train, y_test=train_test_split(X,
                                                 y,
                                                 test_size=.2,
                                                 random_state=42)

#### Count Vectorizer

In [9]:
cvec = CountVectorizer(min_df=5, max_df=.98, ngram_range=(1,2))

In [10]:
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [11]:
mlpreg = MLPRegressor(hidden_layer_sizes=(3,), activation='relu',
         solver='adam', alpha=0.1, batch_size='auto',
         learning_rate='adaptive', learning_rate_init=0.01,
         power_t=0.5, max_iter=1000, shuffle=True, random_state=9,
         tol=0.0001, verbose=False, warm_start=False, momentum=0.9,
         nesterovs_momentum=True, early_stopping=False,
         validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
         epsilon=1e-08)

In [12]:
mlpreg.fit(X_train_cvec,y_train)

MLPRegressor(alpha=0.1, hidden_layer_sizes=(3,), learning_rate='adaptive',
             learning_rate_init=0.01, max_iter=1000, random_state=9)

In [13]:
r2_score = mlpreg.score(X_train_cvec, y_train)
test_score = mlpreg.score(X_test_cvec, y_test)
cv_score = cross_val_score(mlpreg, X_train_cvec, y_train, cv = 5).mean()
train_RMSE = np.sqrt(mean_squared_error(y_train, mlpreg.predict(X_train_cvec)))
test_RMSE = np.sqrt(mean_squared_error(y_test, mlpreg.predict(X_test_cvec)))

print(f'R2 score: {r2_score}')
print(f'Test score: {test_score}')
print(f'CV score: {test_score}')
print(f'Train RMSE: {train_RMSE}')
print(f'Test RMSE: {test_RMSE}')

R2 score: 0.9916334967714606
Test score: 0.07756527478941311
CV score: 0.07756527478941311
Train RMSE: 10.700948397294704
Test RMSE: 107.9127992891644


The CV and test scores indicate that the bag of words model is not predictive of claps.

#### TfidfVectorizer

In [14]:
tvec = TfidfVectorizer(min_df=5, max_df=.98, ngram_range=(1,2))

In [15]:
X_train_tvec = tvec.fit_transform(X_train)
X_test_tvec = tvec.transform(X_test)

In [16]:
mlpreg = MLPRegressor(hidden_layer_sizes=(3,), activation='relu',
         solver='adam', alpha=0.1, batch_size='auto',
         learning_rate='adaptive', learning_rate_init=0.01,
         power_t=0.5, max_iter=1000, shuffle=True, random_state=9,
         tol=0.0001, verbose=False, warm_start=False, momentum=0.9,
         nesterovs_momentum=True, early_stopping=False,
         validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
         epsilon=1e-08)

In [17]:
mlpreg.fit(X_train_tvec,y_train)

MLPRegressor(alpha=0.1, hidden_layer_sizes=(3,), learning_rate='adaptive',
             learning_rate_init=0.01, max_iter=1000, random_state=9)

In [18]:
r2_score = mlpreg.score(X_train_tvec, y_train)
test_score = mlpreg.score(X_test_tvec, y_test)
cv_score = cross_val_score(mlpreg, X_train_tvec, y_train, cv = 5).mean()
train_RMSE = np.sqrt(mean_squared_error(y_train, mlpreg.predict(X_train_tvec)))
test_RMSE = np.sqrt(mean_squared_error(y_test, mlpreg.predict(X_test_tvec)))

print(f'R2 score: {r2_score}')
print(f'Test score: {test_score}')
print(f'CV score: {test_score}')
print(f'Train RMSE: {train_RMSE}')
print(f'Test RMSE: {test_RMSE}')

R2 score: 0.9982120098444842
Test score: 0.012594866785146541
CV score: 0.012594866785146541
Train RMSE: 4.946896680862746
Test RMSE: 111.64848308468554


The CV and test scores indicate that the bag of words model is not predictive of claps.