In [1]:
#install and import required libraries

import pickle as pkl
import numpy as np
import pandas as pd
import nltk
from sklearn.metrics import accuracy_score 
nltk.download('stopwords')
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pradxn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#read the dataset and display it

review_data = pd.read_csv('chrome_reviews.csv')
review_data

Unnamed: 0,ID,Review URL,Text,Star,Thumbs Up,User Name,Developer Reply,Version,Review Date,App ID
0,3886,https://play.google.com/store/apps/details?id=...,This is very helpfull aap.,5,0,INDIAN Knowledge,,83.0.4103.106,2020-12-19,com.android.chrome
1,3887,https://play.google.com/store/apps/details?id=...,Good,3,2,Ijeoma Happiness,,85.0.4183.127,2020-12-19,com.android.chrome
2,3888,https://play.google.com/store/apps/details?id=...,Not able to update. Neither able to uninstall.,1,0,Priti D BtCFs-29,,85.0.4183.127,2020-12-19,com.android.chrome
3,3889,https://play.google.com/store/apps/details?id=...,Nice app,4,0,Ajeet Raja,,77.0.3865.116,2020-12-19,com.android.chrome
4,3890,https://play.google.com/store/apps/details?id=...,Many unwanted ads,1,0,Rams Mp,,87.0.4280.66,2020-12-19,com.android.chrome
...,...,...,...,...,...,...,...,...,...,...
7199,684987,https://play.google.com/store/apps/details?id=...,Bagusss..,5,0,boima panjaitan,,88.0.4324.93,2021-03-05,com.android.chrome
7200,684988,https://play.google.com/store/apps/details?id=...,Bad version 😔,1,0,निशान्त सिंह,,88.0.4324.181,2021-03-05,com.android.chrome
7201,684989,https://play.google.com/store/apps/details?id=...,One thing that I have to say I can't spelled t...,5,0,Virgie Allen,,89.0.4389.72,2021-03-05,com.android.chrome
7202,684990,https://play.google.com/store/apps/details?id=...,Excellent,5,0,Kazuo Guevarra,,89.0.4389.72,2021-03-05,com.android.chrome


In [3]:
#remove unwanted data and columns

data = review_data.drop(['Review URL', 'Thumbs Up', 'User Name', 'Developer Reply', 'Version', 'Review Date', 'App ID'], axis=1)
data

Unnamed: 0,ID,Text,Star
0,3886,This is very helpfull aap.,5
1,3887,Good,3
2,3888,Not able to update. Neither able to uninstall.,1
3,3889,Nice app,4
4,3890,Many unwanted ads,1
...,...,...,...
7199,684987,Bagusss..,5
7200,684988,Bad version 😔,1
7201,684989,One thing that I have to say I can't spelled t...,5
7202,684990,Excellent,5


In [4]:
#Assign if the text is positive or negative based on star ratings
#if rating is equal or greater than 2, then assign it as positive, else negative

data['Review'] = data['Star'].apply(lambda x: 'Positive' if x >= 2 else 'Negative')
data.head()

Unnamed: 0,ID,Text,Star,Review
0,3886,This is very helpfull aap.,5,Positive
1,3887,Good,3,Positive
2,3888,Not able to update. Neither able to uninstall.,1,Negative
3,3889,Nice app,4,Positive
4,3890,Many unwanted ads,1,Negative


In [5]:
#check the dataset for any null values, white spaces and other non-word characters which might affect the solution

redundant_data=re.findall('\W+',str(data['Text']))
redundant_data

['                              ',
 ' ',
 ' ',
 ' ',
 ' ',
 '.\n',
 '                                                    ',
 '\n',
 '          ',
 ' ',
 ' ',
 ' ',
 '. ',
 ' ',
 ' ',
 ' ',
 '.\n',
 '                                                ',
 ' ',
 '\n',
 '                                       ',
 ' ',
 ' ',
 '\n                              ...                        \n',
 '                                            ',
 '..\n',
 '                                        ',
 ' ',
 ' 😔\n',
 '    ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 "'",
 ' ',
 ' ',
 '...\n',
 '                                            ',
 '\n',
 '    ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ...\n',
 ': ',
 ', ',
 ': ',
 ', ',
 ': ']

In [6]:
#portstemmer algorithm is used to remove affixes of a given word, leaving only the main/root word


ps=PorterStemmer()

def cleaning_data(dataframe):
    corpus=[]
    for i in range(0,len(dataframe)):
        #remove non-word character data
        review=re.sub("[^a-zA-Z]"," ",str(dataframe['Text'][i]))
        #convert everything into lower case format
        review=review.lower()
        #split sentences into words
        review=review.split()
        #remove affixes
        review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
        #join words back to form sentences
        review=' '.join(review)
        #list of reviews
        corpus.append(review)
      

    for i in range(len(corpus)):
        dataframe['Text'][i]=corpus[i]
        
    #remove any '' or NaN even after cleaning data initially
    value_nan = float("NaN")
    dataframe.replace("", value_nan, inplace=True)
    dataframe=dataframe.dropna()
    dataframe.reset_index(inplace=True)
    corpus=list(dataframe['Text'])
        
    return dataframe

In [7]:
#clean the working data

data=cleaning_data(data)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['Text'][i]=corpus[i]


Unnamed: 0,index,ID,Text,Star,Review
0,0,3886,helpful aap,5,Positive
1,1,3887,good,3,Positive
2,2,3888,abl updat neither abl uninstal,1,Negative
3,3,3889,nice app,4,Positive
4,4,3890,mani unwant ad,1,Negative


In [8]:
#split data into dependent and independent values

X=data['Text'].values
y=data['Review'].values

#split data into testing and training data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 1)

In [None]:
# Creating model for Logisticregression
from sklearn.linear_model import LogisticRegression
text_clf_Log = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clfLog', LogisticRegression())])
parameters_Log = {'vect__ngram_range': [(1, 1), (1, 2),(1,3)],'tfidf__use_idf': (True, False),
                  'clfLog__penalty':['l1','l2','elasticnet','none'],'clfLog__C':[1.0,1.5,2.0],
                  'clfLog__solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                  'clfLog__multi_class':['ovr']}
LogisticRegression_classifier = GridSearchCV(text_clf_Log, parameters_Log, n_jobs=-1)
LogisticRegression_classifier = LogisticRegression_classifier.fit(X_train,y_train)

Logistic_model=LogisticRegression_classifier.best_estimator_
y_pred=Logistic_model.predict(X_test)

print("Test accuracy score : ",accuracy_score(y_test,y_pred)) 
print(LogisticRegression_classifier.best_score_)
print(LogisticRegression_classifier.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt





In [None]:
#save the output in a pickle file

classifier = LogisticRegression()

pkl_out = open("chrome_reviews.pkl", "wb")
pkl.dump(classifier,pkl_out)
pkl_out.close()