In [None]:
# Attach Google Drive to the Colab runtime so files under it can be read later.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
from numpy import array
from numpy import asarray
from numpy import zeros
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle

In [None]:
# Fetch the NLTK 'popular' collection; the call's return value is shown as
# the cell output.
import nltk

nltk.download(info_or_id='popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

In [None]:
# Load the dataset and discard the two columns that are not used downstream.
data = pd.read_csv('FinalData.csv').drop(columns=['Unnamed: 0', 'Upvote_ratio'])

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
from nltk.stem import WordNetLemmatizer,PorterStemmer

lemmatizer = WordNetLemmatizer()
# Not used by text_preprocess; kept so the module-level name stays defined.
stemmer = PorterStemmer()


def text_preprocess(text):
    """Normalise one title string for tokenisation.

    Steps, in order:
      1. strip every character that is neither a word character nor whitespace;
      2. lower-case and split on whitespace;
      3. drop tokens found in scikit-learn's ENGLISH_STOP_WORDS;
      4. lemmatise each remaining token with the WordNet lemmatizer.

    Args:
        text: the raw title; must be a ``str`` (``re.sub`` raises ``TypeError``
            for anything else, e.g. a missing value).

    Returns:
        The processed tokens joined by single spaces.
    """
    text = re.sub(r'[^\w\s]', '', text)
    l_text = [word for word in text.lower().split() if word not in ENGLISH_STOP_WORDS]
    # Only the lemmas are returned, so no stemming pass is performed here.
    return " ".join(lemmatizer.lemmatize(w) for w in l_text)


data['Title'] = data['Title'].map(text_preprocess)

In [None]:
# Bucket every row's `compound` score into a sentiment label:
#   compound >= 0.5       -> 'positive'
#   0 <= compound < 0.5   -> 'neutral'
#   compound < 0          -> 'negative'
compound_scores = data['compound']
if compound_scores.isna().any():
    # A missing score matches none of the buckets. Fail loudly rather than
    # looping forever or silently assigning an arbitrary label.
    raise ValueError("'compound' contains missing values; cannot assign a sentiment label")

# np.select picks the first condition that holds for each row, so the order of
# the conditions encodes the bucket boundaries. The computation is positional
# and does not depend on the DataFrame's index labels.
predicted_value = np.select(
    [(compound_scores >= 0.5).to_numpy(), (compound_scores >= 0).to_numpy()],
    ['positive', 'neutral'],
    default='negative',
).tolist()

In [None]:
# Store the sentiment labels, then remove the raw score columns they replace.
data['Predicted_value'] = predicted_value
data.drop(columns=['neg', 'neu', 'pos', 'compound'], inplace=True)

In [None]:
# Target is the 'Score' column; every other column is a feature.
y = data['Score']
X = data.drop(columns=['Score'])

In [None]:
# 60/40 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.40,
    random_state=42,
)

In [None]:
# Build the vocabulary from the training titles only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts=X_train["Title"])

In [None]:
# One extra slot on top of the tokenizer's word index.
vocab_size = 1 + len(tokenizer.word_index)

# Turn each title into its sequence of token ids.
train_title, test_title = (
    tokenizer.texts_to_sequences(frame["Title"]) for frame in (X_train, X_test)
)

In [None]:
# Persist the word -> id mapping as a one-row CSV (one column per word).
indixes = tokenizer.word_index
tokens = {word: [token_id] for word, token_id in indixes.items()}
df_tokens = pd.DataFrame(tokens)
df_tokens.to_csv('tokenizer.csv', index=False, header=True)

In [None]:
# Pad (at the end) or truncate every id sequence to a fixed length.
maxlen = 300
train_title, test_title = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (train_title, test_title)
)

In [None]:
# Parse the GloVe text file into {word: float32 vector}.
embeddings_dictionary = {}
# The context manager guarantees the file is closed even if a line fails to parse.
with open('/content/drive/My Drive/glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Skip blank lines instead of failing on records[0].
            continue
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# Row `index` holds the vector of the word with that tokenizer id; ids whose
# word has no GloVe entry keep an all-zero row.
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
# Write the embedding matrix to a space-separated file, two decimals per value.
df = pd.DataFrame(embedding_matrix.astype(float))
df.to_csv('glove.csv', index=False, header=True, sep=' ', float_format='%.2f')

In [None]:
# Map each training row position to the list of embedding rows of its token ids.
train_new = {
    position: [embedding_matrix[token_id] for token_id in padded_ids]
    for position, padded_ids in enumerate(train_title)
}

In [None]:
# Map each test row position to the list of embedding rows of its token ids.
test_new = {
    position: [embedding_matrix[token_id] for token_id in padded_ids]
    for position, padded_ids in enumerate(test_title)
}

In [None]:
# Collapse each training title to the element-wise mean of its token vectors.
X_train_df = [
    np.mean(np.array(token_vectors), axis=0)
    for token_vectors in train_new.values()
]

In [None]:
# Collapse each test title to the element-wise mean of its token vectors.
X_test_df = [
    np.mean(np.array(token_vectors), axis=0)
    for token_vectors in test_new.values()
]

In [None]:
# Stack the per-title mean vectors into DataFrames (one row per title).
X_train_df, X_test_df = (
    pd.DataFrame(np.array(mean_vectors))
    for mean_vectors in (X_train_df, X_test_df)
)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the two categorical columns; the encoder is fitted on the
# training rows only and ignores categories it has not seen at transform time.
categories = ['Over_18', 'Predicted_value']
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train[categories])

# Persist the fitted encoder; the context manager closes (and flushes) the file.
with open('encoding.pkl', 'wb') as encoder_file:
    pickle.dump(enc, encoder_file)

In [None]:
# Flatten the encoder's per-column category arrays into one list of labels.
col_names = []
for category_values in enc.categories_:
    col_names.extend(category_values)

In [None]:
# Apply the fitted encoder to the categorical columns of both splits.
train_encoded, test_encoded = (
    enc.transform(frame[categories]) for frame in (X_train, X_test)
)

In [None]:
# Remove the columns that have been replaced by engineered features.
# Rebinding to the frame returned by drop() (instead of inplace=True) avoids
# pandas' SettingWithCopyWarning, which fires when a frame derived from
# another frame is mutated in place.
X_train = X_train.drop(columns=["Title", 'Over_18', 'Predicted_value'])
X_test = X_test.drop(columns=["Title", 'Over_18', 'Predicted_value'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
# Replace each split's index with 0..n-1 (the old index is discarded).
for split_frame in (X_train, X_test):
    split_frame.reset_index(drop=True, inplace=True)

In [None]:
# Densify the one-hot matrices and label their columns with the category names.
train, test = (
    pd.DataFrame(encoded.todense(), columns=col_names)
    for encoded in (train_encoded, test_encoded)
)

In [None]:
# Final design matrices: remaining original columns + mean title embedding +
# one-hot encoded categoricals, joined column-wise.
X_train = pd.concat([X_train, X_train_df, train], axis=1)
X_test = pd.concat([X_test, X_test_df, test], axis=1)

# The embedding frame contributes integer column labels while the other parts
# contribute non-integer ones. Recent scikit-learn releases refuse DataFrames
# whose column labels are not all strings, so normalise every label to str.
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from sklearn import metrics

In [None]:
# k-nearest-neighbours regressor with default settings, scored on the test split.
knn = KNeighborsRegressor().fit(X_train, y_train)
test_preds2 = knn.predict(X_test)
RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, test_preds2))
print("RMSE TestData = ", str(RMSE_test))
print('RSquared value on test:', knn.score(X_test, y_test))

RMSE TestData =  7451.644931659574
RSquared value on test: 0.36109834692091725


In [None]:
# Ordinary least-squares regression, scored on the test split.
lm = LinearRegression().fit(X_train, y_train)
test_pred = lm.predict(X_test)
RMSE_test = np.sqrt(mean_squared_error(y_test, test_pred))
print("RMSE TestData = ", str(RMSE_test))
print('RSquared value on test:', lm.score(X_test, y_test))

RMSE TestData =  8948.74957123513
RSquared value on test: 0.07858682970448438


In [None]:
# Decision-tree regressor, scored on the test split.
# random_state is fixed so repeated runs build the same tree and report the
# same metrics; 42 matches the seed already used for the train/test split.
DT = DecisionTreeRegressor(random_state=42)
DT.fit(X_train, y_train)
test_preds = DT.predict(X_test)
RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, test_preds))
print("RMSE TestData = ", str(RMSE_test))
print('RSquared value on test:', DT.score(X_test, y_test))

RMSE TestData =  9989.052566903001
RSquared value on test: -0.14809626350053007


In [None]:
# Random-forest regressor (all CPU cores), scored on the test split.
# random_state is fixed so repeated runs build the same forest and report the
# same metrics; 42 matches the seed already used for the train/test split.
RF = RandomForestRegressor(n_jobs=-1, random_state=42)
RF.fit(X_train, y_train)
test_preds1 = RF.predict(X_test)
RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, test_preds1))
print("RMSE TestData = ", str(RMSE_test))
print('RSquared value on test:', RF.score(X_test, y_test))

RMSE TestData =  6810.250588597596
RSquared value on test: 0.46635076500393124


In [None]:
# Fit a fresh default KNN regressor on the training split and persist it.
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

# The context manager closes (and flushes) the pickle file deterministically.
with open('kn.pkl', 'wb') as model_file:
    pickle.dump(knn, model_file)

In [None]:
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# XGBoost regressor with default settings, scored on the test split.
xgbr = xgb.XGBRegressor()
xgbr.fit(X_train, y_train)
test_preds6 = xgbr.predict(X_test)
RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, test_preds6))
print("RMSE TestData = ", str(RMSE_test))
print('RSquared value on test:', xgbr.score(X_test, y_test))

RMSE TestData =  6559.128029755439
RSquared value on test: 0.5049809383493384


In [None]:
# Lasso with 10-fold cross-validated regularisation, scored on the test split.
lasso = LassoCV(cv=10)
lasso.fit(X_train, y_train)
test_preds3 = lasso.predict(X_test)
RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, test_preds3))
print("RMSE TestData = ", str(RMSE_test))
print('RSquared value on test:', lasso.score(X_test, y_test))

RMSE TestData =  9049.800344322846
RSquared value on test: 0.05765983503903849


In [None]:
# Ridge with 10-fold cross-validated regularisation, scored on the test split.
ridge = RidgeCV(cv=10)
ridge.fit(X_train, y_train)
test_preds4 = ridge.predict(X_test)
RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, test_preds4))
print("RMSE TestData = ", str(RMSE_test))
print('RSquared value on test:', ridge.score(X_test, y_test))

RMSE TestData =  8939.214392384392
RSquared value on test: 0.08054937397011164


In [None]:
# Elastic net with 10-fold cross-validated regularisation, scored on the test split.
elastic_net = ElasticNetCV(cv=10)
elastic_net.fit(X_train, y_train)
test_preds5 = elastic_net.predict(X_test)
RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, test_preds5))
print("RMSE TestData = ", str(RMSE_test))
print('RSquared value on test:', elastic_net.score(X_test, y_test))

RMSE TestData =  9002.04898892087
RSquared value on test: 0.06757813205862173
