In [1]:
## Random Forest Classifier on our dataset

import pandas as pd
import numpy as np

jokes_df = pd.read_json('datasets/final_user_joke_dataset.json')

jokes_df.head()

Unnamed: 0,joke_id,USF1000_rating,USF1001_rating,USF1002_rating,datasource_joke_id,jokeText,jester,reddit,stupidstuff,toxic,...,sexist,age_24,age_26,gender_Female,gender_Male,ethnicity_Middle Eastern,ethnicity_South East Asian,country_India,country_Iran,location_San Francisco
0,RE5tz52q,1,8,0,5tz52q,I hate how you cant even say black paint anymo...,0,1,0,0.948757,...,0.009227,0.033333,0.266667,0.033333,0.266667,0.0,0.3,0.3,0.0,0.3
1,RE5tz4dd,1,8,0,5tz4dd,What's the difference between a Jew in Nazi Ge...,0,1,0,0.740485,...,0.010509,0.033333,0.266667,0.033333,0.266667,0.0,0.3,0.3,0.0,0.3
2,RE5tz319,0,9,8,5tz319,I recently went to America....\n...and being t...,0,1,0,0.7138,...,0.008724,0.0,0.566667,0.266667,0.3,0.266667,0.3,0.3,0.266667,0.566667
3,RE5tz2wj,3,9,0,5tz2wj,"Brian raises his hand and says, “He’s in Heave...",0,1,0,0.863763,...,0.01006,0.1,0.3,0.1,0.3,0.0,0.4,0.4,0.0,0.4
4,RE5tz1pc,5,3,8,5tz1pc,You hear about the University book store worke...,0,1,0,0.556188,...,0.006886,0.166667,0.366667,0.433333,0.1,0.266667,0.266667,0.266667,0.266667,0.533333


In [2]:
## Read in users.csv

users = pd.read_csv('datasets/users.csv')

users.head()

Unnamed: 0,user_id,name,age,gender,ethnicity,country,location,avg_user_rating,city,buddy
0,USF1000,Tanya,24,Female,South East Asian,India,San Francisco,5.968586,Delhi,USF1001
1,USF1001,rishab,26,Male,South East Asian,India,San Francisco,6.727749,Mumbai,USF1000
2,USF1002,Parisa,26,Female,Middle Eastern,Iran,San Francisco,0.806283,Ahvaz,USF1001


### Picking the target user as USF1000 for an example

In [4]:
target_user = "USF1000"

## get a dict of target user traits

target_user_traits = users[users['user_id'] == target_user].to_dict('records')[0]

target_user_traits

{'user_id': 'USF1000',
 'name': 'Tanya',
 'age': 24,
 'gender': 'Female',
 'ethnicity': 'South East Asian',
 'country': 'India',
 'location': 'San Francisco',
 'avg_user_rating': 5.968586387434555,
 'city': 'Delhi',
 'buddy': 'USF1001'}

In [4]:
## Get info of taget user's buddy

target_user_buddy_traits = users[users['user_id'] == target_user_traits['buddy']].to_dict('records')[0]

target_user_buddy_traits

{'user_id': 'USF1001',
 'name': 'rishab',
 'age': 26,
 'gender': 'Male',
 'ethnicity': 'South East Asian',
 'country': 'India',
 'location': 'San Francisco',
 'avg_user_rating': 6.727748691099476,
 'city': 'Mumbai',
 'buddy': 'USF1000'}

In [5]:
# Get columns to keep for our model 

relevant_columns = set()

traits_to_ignore = ['user_id', 'buddy', 'name', 'avg_user_rating', 'city']

for key, value in target_user_traits.items():
    if key not in traits_to_ignore:
        relevant_columns.add(str(key) + "_" + str(value))


for key, value in target_user_buddy_traits.items():
    if key not in traits_to_ignore:
        relevant_columns.add(str(key) + "_" + str(value))

relevant_columns = list(relevant_columns)

In [6]:
similarity_threshold = 0.30

for col in relevant_columns:
    jokes_df = jokes_df[jokes_df[col] > similarity_threshold]

In [7]:
jokes_df.columns

Index(['joke_id', 'USF1000_rating', 'USF1001_rating', 'USF1002_rating',
       'datasource_joke_id', 'jokeText', 'jester', 'reddit', 'stupidstuff',
       'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate',
       'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise',
       'misogynist', 'sexist', 'age_24', 'age_26', 'gender_Female',
       'gender_Male', 'ethnicity_Middle Eastern', 'ethnicity_South East Asian',
       'country_India', 'country_Iran', 'location_San Francisco'],
      dtype='object')

In [8]:
original_relevant_columns = [
    target_user_traits['user_id'] + '_rating',
    # target_user_buddy_traits['user_id'] + '_rating',
    # 'joke_id',
    # 'datasource_joke_id',
    # 'jokeText',
    'jester', 
    'reddit', 
    'stupidstuff',
    'toxic', 
    'severe_toxic', 
    'obscene', 
    'threat', 
    'insult', 
    'identity_hate',
    'anger', 
    'disgust', 
    'fear', 
    'joy', 
    'neutral', 
    'sadness', 
    'surprise',
    'misogynist', 
    'sexist'
]

original_relevant_columns.extend(relevant_columns)

jokes_df = jokes_df[original_relevant_columns]

jokes_df.head()

Unnamed: 0,USF1000_rating,jester,reddit,stupidstuff,toxic,severe_toxic,obscene,threat,insult,identity_hate,...,surprise,misogynist,sexist,gender_Female,country_India,age_26,ethnicity_South East Asian,age_24,gender_Male,location_San Francisco
65,9,1,0,0,0.767418,0.019666,0.068701,0.021141,0.081694,0.041381,...,0.002103,0.000242,0.021917,0.3,0.633333,0.333333,0.633333,0.3,0.333333,0.633333
105,9,0,0,1,0.840182,0.003034,0.055259,0.008206,0.066514,0.026806,...,0.002668,0.000193,0.026017,0.6,0.633333,0.633333,0.633333,0.3,0.333333,0.933333
115,9,0,0,1,0.803803,0.019157,0.046692,0.02492,0.05649,0.048938,...,0.042148,0.000262,0.014791,0.6,0.633333,0.633333,0.633333,0.3,0.333333,0.933333
120,9,0,0,1,0.902783,0.006285,0.030891,0.009959,0.035003,0.015078,...,0.081667,0.000175,0.010131,0.3,0.633333,0.333333,0.633333,0.3,0.333333,0.633333
132,9,0,0,1,0.753506,0.021958,0.091486,0.023492,0.06959,0.039968,...,0.161406,0.000318,0.01277,0.3,0.633333,0.333333,0.633333,0.3,0.333333,0.633333


In [9]:
label_df = jokes_df[target_user_traits['user_id'] + '_rating']

In [10]:
from sklearn.model_selection import train_test_split

jokes_df = jokes_df.drop([target_user_traits['user_id'] + '_rating', target_user_traits['user_id'] + '_rating'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(jokes_df, label_df , test_size=0.2, random_state=42)

In [11]:
X_train.head()

Unnamed: 0,jester,reddit,stupidstuff,toxic,severe_toxic,obscene,threat,insult,identity_hate,anger,...,surprise,misogynist,sexist,gender_Female,country_India,age_26,ethnicity_South East Asian,age_24,gender_Male,location_San Francisco
158,0,0,1,0.945948,0.000666,0.012264,0.0005,0.029687,0.010936,0.001126,...,0.020176,0.999734,0.491376,0.3,0.633333,0.333333,0.633333,0.3,0.333333,0.633333
159,0,0,1,0.844782,0.011674,0.050093,0.013947,0.05332,0.026184,0.003799,...,0.081153,0.000164,0.008071,0.333333,0.666667,0.333333,0.666667,0.333333,0.333333,0.666667
142,0,0,1,0.798181,0.003841,0.118678,0.005833,0.056477,0.01699,0.048325,...,0.001994,0.000461,0.020686,0.3,0.633333,0.333333,0.633333,0.3,0.333333,0.633333
168,0,0,1,0.435453,0.094279,0.13773,0.091211,0.131759,0.109567,0.001842,...,0.843719,0.000275,0.014504,0.3,0.633333,0.333333,0.633333,0.3,0.333333,0.633333
105,0,0,1,0.840182,0.003034,0.055259,0.008206,0.066514,0.026806,0.271018,...,0.002668,0.000193,0.026017,0.6,0.633333,0.633333,0.633333,0.3,0.333333,0.933333


In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier


# params = {'max_depth': [3, 4, 5], # Go smaller
#           'criterion': ['entropy', 'gini'],
#           'n_estimators': [115, 130, 150], # Go smaller, 3/5/10,x,
#           'max_features': [None]
#         }

# params = {'max_depth': [None, 5, 10, 20, 30], 
#           'criterion': ['absolute_error', 'squared_error'],
#           'n_estimators': [115, 130, 150, 200], 
#           'max_features': [None, 'log2']
#         }

params = {'max_depth': [5, 10, 9],
          'criterion': ['squared_error'],
          'n_estimators': [50, 150, 200]
        }


folds = 5
rf = RandomForestRegressor()

clf = GridSearchCV(rf, params, cv=folds, n_jobs=-1)

clf.fit(X_train, y_train)
print(clf.best_params_)

{'criterion': 'squared_error', 'max_depth': 9, 'n_estimators': 50}


In [13]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# Train the model with the best parameters obtained

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

rf = RandomForestRegressor(**clf.best_params_, random_state=42, n_jobs=-1)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

accuracy = rf.score(X_test, y_test)
print("Accuracy->", accuracy)

rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE->", rmse)

Accuracy-> 1.0
RMSE-> 0.0


In [14]:
# implent K-Nearest Neighbors regressor (KNN) algorithm to predict the rating of a joke for a user

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

accuracy = knn.score(X_test, y_test)

print("Accuracy->", accuracy)

Accuracy-> 0.5


In [15]:
# implement SVM for the same

from sklearn.svm import SVC

svm = SVC()

svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)

accuracy = svm.score(X_test, y_test)

print("Accuracy->", accuracy)

Accuracy-> 0.5


In [17]:
# implent K-Nearest Neighbors regressor (KNN) algorithm to predict the rating of a joke for a user

from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=5)

knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

mse = mean_squared_error(y_test, y_pred)

print("MSE->", mse)

MSE-> 0.46000000000000046


In [16]:
# Implement Gradient Boosting Regressor

from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor()

gbr.fit(X_train, y_train)

y_pred = gbr.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mse = format(mse, ".3f")
print("MSE-> ", mse)

accuracy = gbr.score(X_test, y_test)
accuracy = format(accuracy, ".3f")
print("Accuracy-> ", accuracy)

MSE->  0.000
Accuracy->  1.000
