In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
df= pd.read_csv(r'../input/ffdata/Reviews.csv')

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df['helpful_perc']=np.where(df['HelpfulnessDenominator']>0,df['HelpfulnessNumerator']/df['HelpfulnessDenominator'],-1)

In [None]:
## defining bins

df['upvote_perc']=pd.cut(df['helpful_perc'],bins=[-1,0,0.2,0.4,0.6,0.8,1],labels=['Empty','0-20%','20-40%','40-60%','60-80%','80-100%'])

In [None]:
df.head()

#### Analyzing Upvotes based on scores.

In [None]:
df.groupby(['Score','upvote_perc']).agg('count')

In [None]:
df_s= df.groupby(['Score','upvote_perc']).agg({'Id':'count'}).reset_index()

In [None]:
df_s

In [None]:
pivot=df_s.pivot(index='upvote_perc',columns='Score')
pivot

In [None]:
import seaborn as sns

In [None]:
sns.heatmap(pivot,annot=True, cmap='YlGnBu')

#### We can see that most of the reviews are 5-star

In [None]:
df['Score'].unique()

In [None]:
## Assuming neutral reviews to be 3, therefore filtering them

df2 = df[df['Score']!=3]

In [None]:
df2['Score'].unique()

#### Defining X and Y

In [None]:
X = df2['Text']

In [None]:
## before defining y we need to map the reviews with 0 & 1, where reviews with 1&2 represent 0 and 4&5 represent 1

y_dict = {1:0, 2:0, 4:1, 5:1}

y = df2['Score'].map(y_dict)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
c = CountVectorizer(stop_words='english')

In [None]:
X_c = c.fit_transform(X)

In [None]:
X_c.shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test= train_test_split(X_c,y)

In [None]:
X_train.shape

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
log= LogisticRegression()

In [None]:
ml = log.fit(X_train,y_train)

In [None]:
ml.score(X_test,y_test)

### Top 20 positive & negative words

In [None]:
## getting feature names

w= c.get_feature_names()

In [None]:
coef= ml.coef_.tolist()[0]

In [None]:
coef_df= pd.DataFrame({"Word":w,"coeffcient":coef})
coef_df.head()

In [None]:
coef_df= coef_df.sort_values(['coeffcient','Word'],ascending=False)

In [None]:
## Top 20 positive words

coef_df.head(20)

In [None]:
coef_df.tail(20)

### Predictions

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
def predict(X,y,nlp_model,ml_model,random_state=None):
    """Vectorize ``X``, fit ``ml_model`` on a train split and print hold-out metrics.

    Parameters
    ----------
    X
        Input passed as-is to ``nlp_model.fit_transform``.
    y
        Labels aligned with ``X``; split together with the vectorized features.
    nlp_model
        Object exposing ``fit_transform`` (e.g. a text vectorizer).
    ml_model
        Estimator whose ``fit`` returns an object exposing ``predict``.
    random_state
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    None
        The confusion matrix and then the accuracy are printed.

    Notes
    -----
    ``nlp_model`` is fitted on all of ``X`` before the split, so whatever it
    learns comes from both the train and the test rows.
    """
    features = nlp_model.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        features, y, random_state=random_state
    )
    fitted = ml_model.fit(X_train, y_train)
    predictions = fitted.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred). The original call passed them
    # swapped, which transposed the matrix; with this order rows are the true
    # classes and columns the predicted ones.
    cm = confusion_matrix(y_test, predictions)
    print(cm)
    acc = accuracy_score(y_test, predictions)
    print(acc)

In [None]:
c=CountVectorizer()
lr=LogisticRegression()

In [None]:
predict(X,y,c,lr)

### Upvote Prediction

In [None]:
### Per heatmap high number of users have left five star reviews, therefore we'll filter these review for analysis

data= df[df['Score']==5]
data.head()

In [None]:
data['upvote_perc'].unique()

In [None]:
## removing nuetral votes

data2= data[data['upvote_perc'].isin(['80-100%','60-80%','20-40%', '0-20%'])]

In [None]:
## setting up independent data

X= data2['Text']

In [None]:
y_dict= {'80-100%':1,'60-80%':1,'20-40%':0, '0-20%':0}

y= data2['upvote_perc'].map(y_dict)

In [None]:
y.value_counts()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tf= TfidfVectorizer(stop_words='english')

In [None]:
x_c= tf.fit_transform(X)

### Handling Imbalanced Data

In [None]:
!pip install -U tensorflow==2.0.0
import tensorflow as tf
tf.__version__

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
os= RandomOverSampler()

In [None]:
X_train_res,y_train_res= os.fit_resample(x_c, y)

In [None]:
X_train_res.shape

In [None]:
y_train_res.shape

In [None]:
from collections import Counter

In [None]:
print('Original datset shape {}'.format(Counter(y)))
print('Resampled datset shape {}'.format(Counter(y_train_res)))

### Cross-validation using Grid Search CV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
log_class= LogisticRegression()

In [None]:
grid= {'C':10.0**np.arange(-2,3), 'penalty':['l1', 'l2']}

In [None]:
clf= GridSearchCV(estimator= log_class, param_grid= grid, cv=5, n_jobs= -1, scoring='f1_macro')

In [None]:
clf.fit(X_train_res, y_train_res)

In [None]:
## train test split

X_train,X_test,y_train,y_test= train_test_split(x_c, y)

In [None]:
pred = clf.predict(X_test)

In [None]:
## Confusion matrix

confusion_matrix(y_test, pred)

In [None]:
## Accuracy score

accuracy_score(y_test, pred)