In [18]:
import pandas as pd
import numpy as np
import nltk
nltk.download('words')
import re
from bs4 import BeautifulSoup
import re

[nltk_data] Downloading package words to
[nltk_data]     /Users/devanshugupta/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [22]:
# Accessing data file: ./data/fileName

youtube_dataset = pd.read_csv('./data/youtube_dislike_dataset.csv', index_col = False, usecols = ['dislikes','comments'])

In [24]:
len(youtube_dataset)

37422

In [25]:
#predicted value column - dislikes

type(youtube_dataset.loc[0,'dislikes'])

numpy.int64

In [20]:
words = set(nltk.corpus.words.words())

len(words)

235892

# Data Cleaning


## Pre-processing

### Converting review text to lowercase


In [27]:
youtube_dataset['comments'] = youtube_dataset['comments'].str.lower()

### Removing links

In [28]:
#removing all the URL links from the comment text. 
#For this purpose, we are removing all the words in the comment text which begin with http

youtube_dataset['comments'] = youtube_dataset['comments'].apply(lambda comment: re.sub(r"http\S+", "", str(comment)))


### Removing spaces

In [29]:
#removing all the spaces from the comment body using the strip method.

youtube_dataset['comments'] = youtube_dataset['comments'].str.strip()

### Removing non alphabetical chars

In [30]:
#removing all the nonalphabetical chars

youtube_dataset['comments'] = youtube_dataset['comments'].apply(lambda comment: re.sub(r'[^a-z0-9 ]','', str(comment)))

### remove the stop words 

In [31]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/devanshugupta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
#In the stopwords removal step, we are splitting the comment into words. Then, we are filtering out stopwords and 
# then we are joining the rest of the words

from nltk.corpus import stopwords
 
youtube_dataset['comments'] = youtube_dataset['comments'].apply(lambda x : " ".join([word for word in x.split() if word not in stopwords.words('english')]))

In [34]:
#Creating the vectorizers for training set using TFIDF
#For training data, we are performing the transform step and the fit step in 1 go


from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
comments_transformed = vectorizer.fit_transform(youtube_dataset['comments'])

In [35]:
#Splitting the train and test data in 80-20 ratio

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(comments_transformed, youtube_dataset['dislikes'], test_size = 0.2, train_size = 0.8)



# ML 

In [37]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(x_train, y_train)


y_pred = reg.predict(x_test)

In [62]:
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(1)

# Fit regression model
regr = DecisionTreeRegressor(max_depth=2)
regr.fit(x_train, y_train)
# Predict


y_pred_DT = regr.predict(x_test)



In [79]:
from sklearn.svm import SVR

rng = np.random.RandomState(1)

svr = SVR().fit(x_train, y_train)

In [80]:
# Predict
y_pred_SVR = svr.predict(x_test)

In [75]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(max_depth=3)
rfr.fit(x_train, y_train)
y_pred_RFR = rfr.predict(x_test)

## Mean Squared Error

In [44]:
import math

from sklearn.metrics import mean_squared_error

rmse = math.sqrt(mean_squared_error(y_test, y_pred))

print(rmse)

35005.821222426566


In [68]:
rmse = math.sqrt(mean_squared_error(y_test, y_pred_DT))

print(rmse)

32192.011223785503


In [81]:
rmse = math.sqrt(mean_squared_error(y_test, y_pred_SVR))

print(rmse)

32479.964962909296


In [77]:
rmse = math.sqrt(mean_squared_error(y_test, y_pred_RFR))

print(rmse)

32192.5402829946


## Mean Absolute Error

In [73]:
from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_pred))


16922.51632971837


In [71]:
print(mean_absolute_error(y_test, y_pred_DT))

6784.545852932969


In [82]:
print(mean_absolute_error(y_test, y_pred_SVR))

4807.660173329713


In [78]:
print(mean_absolute_error(y_test, y_pred_RFR))

6751.571641941469
