<a href="https://colab.research.google.com/github/shweta24h/Worthy/blob/main/Project_Playstore_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [None]:
import numpy as np
import pandas as pd

## Importing necessary libraries
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import string
import re
import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and other library
# warnings raised by later cells are hidden as well.
warnings.filterwarnings('ignore')

## Loading Data

In [None]:
## Read the user-review CSV (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="googleplaystore_user_reviews.csv")
# Preview the first five rows
df.head(n=5)

FileNotFoundError: ignored

In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
## Discard every row that has at least one missing value
df = df.dropna(axis=0, how='any')

In [None]:
# Taking only the review and sentiment columns.
# The frame is reassigned (no inplace=True) and missing labels are ignored so
# that re-running this cell does not raise KeyError for already-dropped columns.
df = df.drop(columns=['App','Sentiment_Subjectivity'],errors='ignore')
df.head()

In [None]:
# Fetch the NLTK resources used below.
nltk.download("stopwords")
nltk.download("punkt")
# Stop-word list = English stop words followed by every ASCII punctuation character.
stop_words = [*nltk.corpus.stopwords.words('english'), *string.punctuation]

In [None]:
## Number of whitespace-separated words in each review
df['Num_words'] = df['Translated_Review'].map(lambda review : len(review.split()))
df.head()

In [None]:
## Count of number of Stopwords (case-insensitive) in each review
# Membership is tested against a set: O(1) per token instead of a linear scan
# of the `stop_words` list for every token of every review.
stop_word_set = set(stop_words)
df['Num_stopwords'] = df['Translated_Review'].apply(lambda review : sum(token in stop_word_set for token in review.lower().split()))
df.head()

In [None]:
## Count of number of special characters in each review
# A special character is anything that is neither a word character nor whitespace.
# The previous `token in '[\w\s]'` check was a substring test against the literal
# text of the pattern, so it never applied the character class at all.
df['Num_special_char'] = df['Translated_Review'].str.count(r'[^\w\s]')
df.head()

In [None]:
## Number of non-whitespace characters in each review
df['Num_chars'] = df['Translated_Review'].map(lambda review : sum(map(len, review.split())))
df.head()

In [None]:
# Summary statistics of the numeric columns, computed on the raw (uncleaned) review text.
df.describe()

In [None]:
## Lower-case every review
df['Translated_Review'] = df['Translated_Review'].map(str.lower)
df.head()

In [None]:
## Removing punctuation: drop every character that is neither a word character nor whitespace
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, in which case nothing would be removed.
df['Translated_Review'] = df['Translated_Review'].str.replace(r'[^\w\s]','',regex=True)
df.head()

In [None]:
## Remove stopwords, keeping the remaining tokens joined by single spaces
# Membership is tested against a set: O(1) per token instead of a linear scan
# of the `stop_words` list for every token of every review.
stop_word_set = set(stop_words)
df['Translated_Review'] = df['Translated_Review'].apply(lambda review : ' '.join(token for token in review.split() if token not in stop_word_set))
df.head()

In [None]:
## Drop tokens that consist solely of digits
df['Translated_Review'] = df['Translated_Review'].map(lambda review : ' '.join(token for token in review.split() if not token.isdigit()))
df.head()

In [None]:
## Count of number of Stopwords, recomputed on the cleaned review text
# Membership is tested against a set: O(1) per token instead of a linear scan
# of the `stop_words` list for every token of every review.
stop_word_set = set(stop_words)
df['Num_stopwords'] = df['Translated_Review'].apply(lambda review : sum(token in stop_word_set for token in review.lower().split()))
df.head()

In [None]:
## Count of number of special characters, recomputed on the cleaned review text
# A special character is anything that is neither a word character nor whitespace.
# The previous `token in '[\w\s]'` check was a substring test against the literal
# text of the pattern, so it never applied the character class at all.
df['Num_special_char'] = df['Translated_Review'].str.count(r'[^\w\s]')
df.head()

In [None]:
## Number of non-whitespace characters, recomputed on the cleaned review text
df['Num_chars'] = df['Translated_Review'].map(lambda review : sum(map(len, review.split())))
df.head()

In [None]:
# Summary statistics of the numeric columns after the count features were
# recomputed on the cleaned review text.
df.describe()

In [None]:
# Reduce every word to its stem with NLTK's Porter stemmer
from nltk.stem import PorterStemmer
st = PorterStemmer()
df['Translated_Review'] = df['Translated_Review'].map(lambda review : ' '.join(map(st.stem, review.split())))
df.head()

In [None]:
# The 2000 most frequent tokens across all cleaned reviews, as (token, count) pairs.
most_common = nltk.FreqDist(token for review in df['Translated_Review'] for token in review.split()).most_common(2000)

In [None]:
# Model input is the cleaned review text; the target is the sentiment label.
processed_features, labels = df['Translated_Review'], df['Sentiment']

In [None]:
## Convert the cleaned reviews into a dense TF-IDF feature matrix
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    min_df=7,           # ignore terms found in fewer than 7 reviews
    max_df=0.8,         # ignore terms found in more than 80% of reviews
    max_features=2500,  # cap the vocabulary size
)
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so the vocabulary and IDF weights are derived from test rows too.
processed_features = vectorizer.fit_transform(raw_documents=processed_features).toarray()

In [None]:
# Splitting dataset into Training (70%) and Test (30%) data.
# random_state pins the shuffle so the split, and every score derived from it,
# is reproducible across kernel restarts.
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(processed_features,labels,test_size = 0.3,random_state = 42)

## Naive-Bayes

In [None]:
# Multinomial Naive Bayes classifier trained on the TF-IDF training split
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X=x_train, y=y_train)

In [None]:
# Mean accuracy of the Naive Bayes model on the training split.
# NOTE(review): this is not a held-out score; the cells visible here never
# evaluate this model on x_test.
model.score(x_train, y_train)

## RANDOM FOREST MODEL

In [None]:
from sklearn.ensemble import RandomForestClassifier
# random_state pins the forest's bootstrap sampling and feature selection so
# the fitted model and its scores are reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf_model = rf.fit(x_train,y_train)

In [None]:
# Predict labels for the held-out split (used by the report and heatmap cells).
rf_pred = rf_model.predict(X=x_test)
# The displayed value is the mean accuracy on the *training* split.
rf_model.score(X=x_train, y=y_train)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
# Per-class precision / recall / F1 of the random forest on the test split.
print(classification_report(y_true=y_test, y_pred=rf_pred))

In [None]:
# Confusion matrix of the random forest on the test split.
# Rows are the actual classes and columns the predicted classes. The class order
# is pinned with `labels=` and the same list is used for both axes; the previous
# x tick labels were reversed relative to the matrix columns.
# Tick labels are passed to seaborn so they sit at the cell centres rather than
# on the cell edges at 0, 1 and 2.
sentiment_classes = ['Negative','Neutral','Positive']
sns.heatmap(confusion_matrix(y_test,rf_pred,labels=sentiment_classes),annot=True,fmt = '.5g',
            xticklabels=sentiment_classes,yticklabels=sentiment_classes)
plt.xlabel('Predicted')
plt.ylabel('Actual');

## LOGISTIC REGRESSION MODEL

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default-configured logistic regression on the training split.
lr = LogisticRegression()
lr_model = lr.fit(X=x_train, y=y_train)
# The displayed value is the mean accuracy on the *training* split.
lr_model.score(X=x_train, y=y_train)

In [None]:
# Predict labels for the test split with the fitted logistic regression model.
lr_pred = lr_model.predict(x_test)

In [None]:
# Per-class precision / recall / F1 of the logistic regression on the test split.
print(classification_report(y_test,lr_pred))
# Confusion matrix: rows are the actual classes, columns the predicted classes.
# The class order is pinned with `labels=` so it always matches the axis labels.
# Tick labels are passed to seaborn so they sit at the cell centres rather than
# on the cell edges at 0, 1 and 2.
sentiment_classes = ['Negative','Neutral','Positive']
sns.heatmap(confusion_matrix(y_test,lr_pred,labels=sentiment_classes),annot=True,fmt = '.5g',
            xticklabels=sentiment_classes,yticklabels=sentiment_classes)
plt.xlabel('Predicted')
plt.ylabel('Actual');