In [None]:
import pandas as pd
import numpy as np
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
df = pd.read_csv('../input/reviewuniversalstudio/universal_studio_branches.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.tail()

In [None]:
df['branch'].value_counts()

In [None]:
df['rating'].value_counts()

In [None]:
df.columns

In [None]:
# Binary sentiment label: 1 when the rating is 4.0 or higher, otherwise 0.
df['target'] = df['rating'].ge(4.0).astype('int64')

In [None]:
df.head()

In [None]:
# Keep only the columns used downstream. The explicit .copy() makes `df` an
# independent frame, so the feature columns assigned to it in later cells are
# written to this object and cannot raise pandas' SettingWithCopyWarning.
df = df[['rating','title','review_text', 'branch','target']].copy()

In [None]:
df.head()

# GENERAL FEATURES AND PREPROCESSING

In this section, I will extract all the MANUAL FEATURES that are useful for understanding the data, and then PREPROCESS AND CLEAN the data.
The steps will be

    GENERAL FEATURE EXTRACTION:
        1. WORD COUNTS
        2. CHARACTER COUNTS
        3. STOPWORDS COUNT
        4. HASHTAG (#) AND MENTION (@) COUNTS
        5. IF NUMERIC DIGITS ARE PRESENT
        6. UPPER CASE WORD COUNTS
            
    PREPROCESSING AND CLEANING:
        1. CONVERSION OF UPPER CASE TO LOWER CASE
        2. CONTRACTION TO EXPANSION
        3. EMAILS COUNT AND REMOVAL
        4. REMOVAL OF SPECIAL CHARACTERS
        5. REMOVAL OF HTML TAGS
        6. REMOVAL OF ACCENTED CHARS
        7. REMOVAL OF MULTIPLE SPACES
        8. REMOVAL OF COMMONLY OCCURRING AND RARELY OCCURRING WORDS

In [None]:
def _count_words(value):
    """Number of whitespace-separated tokens in ``str(value)``."""
    return len(str(value).split())

# Word counts for the title and for the review body.
df['wordcounts'] = df['title'].map(_count_words)
df['wordcounts_reviewtext'] = df['review_text'].map(_count_words)

In [None]:
df.head()

In [None]:
# Character counts for the title and the review body. Values go through str()
# first, as the word counts do, so a non-string entry (e.g. a NaN float for a
# missing title) is measured instead of raising TypeError inside len().
df['charactercounts'] = df['title'].apply(lambda x: len(str(x)))
df['charactercounts_reviewtext'] = df['review_text'].apply(lambda x: len(str(x)))

In [None]:
df.head()

In [None]:
def get_avg_word_len(x):
    """Return the mean length of the whitespace-separated words in ``x``.

    Parameters
    ----------
    x : str
        Text to measure.

    Returns
    -------
    float
        Total number of characters across all words divided by the number of
        words, or 0.0 when ``x`` contains no words (empty or whitespace-only
        text), which previously raised ZeroDivisionError.
    """
    words = x.split()
    if not words:
        # No words: define the average as 0.0 rather than dividing by zero.
        return 0.0
    return sum(len(word) for word in words) / len(words)

In [None]:
# Mean word length of each review body.
df['average_wordlength'] = df['review_text'].map(get_avg_word_len)

In [None]:
df.head()

In [None]:
print(STOP_WORDS)

In [None]:
# Count the stop words in each review. Tokens are lower-cased for the lookup:
# the STOP_WORDS entries are lower-case and the text has not been lower-cased
# yet at this point, so capitalised stop words such as "The" or "I" would
# otherwise be missed.
df['Stopwords'] = df['review_text'].apply(lambda x: len([t for t in x.split() if t.lower() in STOP_WORDS]))

In [None]:
df.head()

In [None]:
def _count_prefixed(text, prefix):
    """Number of whitespace-separated tokens in ``text`` that start with ``prefix``."""
    return sum(1 for token in text.split() if token.startswith(prefix))

# Hashtag (#...) and mention (@...) counts per review.
df['hastagscounts'] = df['review_text'].map(lambda text: _count_prefixed(text, '#'))
df['mentions'] = df['review_text'].map(lambda text: _count_prefixed(text, '@'))

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
# Number of whitespace-separated tokens made up only of digits.
df['numericcounts'] = df['review_text'].map(
    lambda text: sum(1 for token in text.split() if token.isdigit())
)

In [None]:
df.head()

In [None]:
# Count fully upper-case words longer than three characters. The length test
# has to apply to the token `t`; it previously used len(x), the length of the
# whole review, which is true for practically every review and so filtered
# nothing.
df['uppercount'] = df['review_text'].apply(lambda x: len([t for t in x.split() if t.isupper() and len(t)>3]))

In [None]:
df.head()

In [None]:
# Normalise the review text to lower case.
df['review_text'] = df['review_text'].map(lambda text: text.lower())

In [None]:
df.head()

In [None]:
# Contraction -> expansion lookup table used when expanding review text.
# Every key appears exactly once (a Python dict literal silently keeps only
# the last value of a repeated key).
contractions = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
# Was "he he will have" (duplicated word).
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
# "i'd" and "i'd've" used to be listed twice; the values below are the ones
# that were actually in effect (the later, lower-case entries).
"i'd": "i would",
"i'd've": "i would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

In [None]:
def cont_to_exp(x, mapping=None):
    """Expand English contractions in ``x``.

    Parameters
    ----------
    x : str or any
        Text to expand. Non-string values are returned unchanged.
    mapping : dict of str to str, optional
        Contraction -> expansion table. Defaults to the module-level
        ``contractions`` dictionary.

    Returns
    -------
    str or any
        ``x`` with every occurrence of each key replaced by its value.

    Notes
    -----
    Keys are applied longest first. In plain dictionary order a short key
    such as "can't" is replaced before "can't've", turning the text into
    "cannot've", which the longer key can then never match.
    """
    if not isinstance(x, str):
        return x
    if mapping is None:
        mapping = contractions
    # sorted() is stable, so keys of equal length keep their table order.
    for key in sorted(mapping, key=len, reverse=True):
        x = x.replace(key, mapping[key])
    return x

In [None]:
%%time
df['review_text'] = df['review_text'].apply(lambda x:cont_to_exp(x))

In [None]:
df.head()

In [None]:
df['mentions']

In [None]:
df['hastagscounts']

In [None]:
import re

In [None]:
# Collect every substring of each review that matches the e-mail pattern.
# Each row of 'emails' holds the list returned by re.findall (empty when
# nothing matches).
df['emails'] = df['review_text'].apply(lambda x: re.findall(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)',x))

In [None]:
df['emails_count'] = df['emails'].apply(lambda x:len(x))

In [None]:
df[df['emails_count']>0]

In [None]:
# Delete every e-mail-like substring from the review text. The pattern is the
# same one used to fill the 'emails' column, so 'emails'/'emails_count' keep
# the addresses that are removed here.
df['review_text'] = df['review_text'].apply(lambda x: re.sub(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', '', x))

In [None]:
df[df['emails_count']>0]

In [None]:
# Count the URL-like matches in each review.
# NOTE(review): the scheme group ((http|ftp|https)://) is optional, so any run
# of word characters joined by dots also matches -- e.g. two words separated by
# a full stop with no following space. 'urls_count' may therefore over-count
# real links; confirm on a sample of rows.
df['urls_count'] = df['review_text'].apply(lambda x: len(re.findall(r'((http|ftp|https):\/\/)?([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])?',x))) 

In [None]:
df.head()

In [None]:
df['urls_count']

In [None]:
# Remove every URL-like match from the review text (same pattern as the count).
# NOTE(review): because the scheme is optional in the pattern, dot-joined words
# with no space after the full stop are deleted as well -- verify that this
# loss of ordinary words is acceptable.
df['review_text'] = df['review_text'].apply(lambda x:re.sub(r'((http|ftp|https):\/\/)?([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])?', '',x))

In [None]:
df.head()

In [None]:
df.loc[50899]['review_text']

In [None]:
# Remove the literal upper-case substring 'RT' wherever it occurs.
# NOTE(review): review_text was lower-cased in an earlier cell, so this
# upper-case pattern is unlikely to match anything -- confirm whether the step
# is still needed.
df['review_text'] = df['review_text'].apply(lambda x: re.sub('RT',"",x))

In [None]:
df.head()

In [None]:
# Keep only ASCII letters, digits, spaces and hyphens; every other character
# (punctuation, angle brackets, accented and other non-ASCII characters) is
# deleted outright, not replaced by a space.
df['review_text'] = df['review_text'].apply(lambda x: re.sub('[^A-Z a-z 0-9-]+', '',x))

In [None]:
df.head()

In [None]:
df.loc[50899]['review_text']

In [None]:
df['review_text'] = df['review_text'].apply(lambda x: " ".join(x.split()))

In [None]:
df.head()

In [None]:
from bs4 import BeautifulSoup

In [None]:
%%time
# Parse each review as HTML and keep only its text content.
# NOTE(review): the special-character removal in an earlier cell already
# deleted '<' and '>', so there are probably no tags left to strip here --
# consider running this step before that one.
df['review_text'] = df['review_text'].apply(lambda x:BeautifulSoup(x,'lxml').get_text())

In [None]:
import unicodedata

In [None]:
def remove_accented_chars(x):
    """Return ``x`` reduced to plain ASCII.

    The text is NFKD-normalised so accented letters decompose into a base
    letter plus combining marks; encoding to ASCII with ``'ignore'`` then
    drops the marks and any other non-ASCII character.
    """
    decomposed = unicodedata.normalize('NFKD', x)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [None]:
%%time
df['review_text'] = df['review_text'].apply(lambda x:remove_accented_chars(x))

In [None]:
df['review_text'] = df['review_text'].apply(lambda x: " ".join([t for t in x.split() if t not in STOP_WORDS]))

In [None]:
df.head()

In [None]:
df.loc[0]['review_text']

In [None]:
#'some', 'quite', 'a', 'anything', 'for', 'be', 'none', 'from', 'others', '‘re', 'one', 'around', 'our', 'same', 'mine', 'somewhere', 'has', 'does', 'every', 'less', 'eight', 'above', 'own', 'two', 'another', 'were', 'namely', 'its', 'keep', 'very', 'behind', 'eleven', 'had', 'regarding', 'indeed', 'few', 'more', 'myself', 'unless', 'their', '‘d', 'everyone', 'bottom', 'thru', "n't", 'put', 'amount', 'my', 'over', 'various', 'n‘t', 'am', 'such', 'up', 'everywhere', 'there', 'towards', '’s', 'nine', 'together', 'not', 'show', 'n’t', 'someone', 'here', 'never', 'to', 'yourselves', 'so', 'hereafter', 'him', 'thereby', 'alone', 'whom', 'therefore', 'an', 'already', 'due', 'whether', 'under', 'who', 'but', 'four', 'name', 'whence', 'she', 'themselves', '‘s', 'either', 'six', 'we', 'can', 'whereas', 'anyway', 'it', 'on', 'neither', 'get', 'wherever', '’d', 'been', 'into', 'latter', '‘ve', 'only', 'wherein', 'did', 'formerly', 'least', 'perhaps', 'say', 'beyond', 're', 'moreover', 'those', "'s", '’ve', 'will', 'me', 'because', 'whole', 'other', '’m', 'several', 'what', 'really', 'nothing', 'yours', 'used', 'while', 'that', 'therein', 'why', 'before', 'three', 'whereafter', 'yet', 'meanwhile', 'ca', 'how', '‘ll', 'using', "'m", "'ve", 'nobody', 'front', 'most', 'give', 'even', 'thereafter', 'else', 'mostly', 'nevertheless', 'of', 'former', 'is', 'beforehand', 'third', 'seem', 'whereupon', 'always', 'side', 'within', 'twelve', 'part', 'at', 'see', 'he', 'call', 'hereupon', 'also', 'per', 'are', 'must', 'besides', 'about', 'with', 'during', '‘m', 'noone', 'although', 'nor', 'would', 'the', 'you', 'thence', 'otherwise', 'ours', 'afterwards', 'elsewhere', 'go', 'down', 'five', 'hereby', 'whose', 'empty', 'thus', 'forty', 'ourselves', 'latterly', 'seemed', 'i', 'twenty', 'though', 'anyhow', 'just', 'anywhere', 'too', "'d", 'anyone', 'make', 'full', 'hence', 'ten', 'somehow', 'which', 'your', 'herein', 'top', 'move', 'everything', 'via', 'nowhere', 'cannot', 'out', 'as', 'next', 
'further', 'itself', 'could', 'amongst', 'becomes', 'being', 'by', 'hers', 'upon', 'her', 'them', 'hundred', 'or', 'please', 'again', 'after', 'done', 'made', 'where', 'and', 'in', 'between', 'all', 'seeming', 'no', 'off', 'should', 'each', 'last', 'was', 'well', 'then', 'throughout', 'this', 'back', 'whoever', 'when', 'beside', 'us', 'among', 'whereby', "'ll", 'whither', 'thereupon', 'along', 'enough', 'ever', 'still', 'now', 'fifty', 'whatever', 'since', '’re', 'rather', 'his', 'take', 'however', 'do', 'except', 'onto', 'these', 'they', 'almost', 'sometimes', 'yourself', 'become', 'than', 'without', 'became', 'sixty', 'toward', 'serious', '’ll', 'himself', 'many', 'against', 'may', 'if', 'until', 'might', 'sometime', 'whenever', 'often', 'seems', 'fifteen', 'herself', 'across', 'once', "'re", 'doing', 'first', 'below', 'something', 'have', 'any', 'much', 'both', 'becoming', 'through'

In [None]:
df

In [None]:
text = ' '.join(df['review_text'])
len(text)

In [None]:
text = ' '.join(df['review_text'])

In [None]:
text = text.split()

In [None]:
freq_common = pd.Series(text).value_counts()
f20= freq_common[:20]

In [None]:
f20

In [None]:
# `f20` is indexed by word, and `in` on a Series tests its index labels, so
# the words to drop are exactly the index of `f20`.
common_words = set(f20.index)
df['review_text'] = df['review_text'].map(
    lambda review: ' '.join(word for word in review.split() if word not in common_words)
)

In [None]:
df.head()

In [None]:
text = ' '.join(df['review_text'])
len(text)

In [None]:
rare20 = freq_common[-20:]

In [None]:
rare20

In [None]:
rare = freq_common[freq_common.values == 1]

In [None]:
rare


In [None]:
# `rare` is indexed by word, and `in` on a Series tests its index labels, so
# the words to drop are exactly the index of `rare`.
rare_words = set(rare.index)
df['review_text'] = df['review_text'].map(
    lambda review: ' '.join(word for word in review.split() if word not in rare_words)
)

In [None]:
df.head()

In [None]:
text = ' '.join(df['review_text'])
len(text)

In [None]:
# Difference between two hard-coded character totals.
# NOTE(review): presumably these literals were copied from the len(text)
# outputs of an earlier run (before / after word removal); they will not
# update if the data or the cleaning steps change -- verify.
17722510 - 14661963

# VISUALIZING WORDCLOUD

We will visualize the positive and negative feedback that visitors gave in their reviews. A WORDCLOUD is a convenient way to visualize and understand visitor behaviour.
The code below imports the WORDCLOUD and matplotlib libraries. The positive and negative reviews are handled separately, and the stopwords are imported from nltk (the Natural Language Toolkit).

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from nltk.corpus import stopwords
# Word cloud built from the reviews labelled positive (target == 1).
df_positive = df.loc[df['target'] == 1, 'review_text']
positive_corpus = " ".join(df_positive)

wordcloud1 = WordCloud(background_color='white', width=2500, height=2000).generate(positive_corpus)

plt.figure(1, figsize=(12, 12))
plt.imshow(wordcloud1)
plt.axis('off')
plt.show()

In [None]:
# Word cloud built from the reviews labelled negative (target == 0).
df_negative = df.loc[df['target'] == 0, 'review_text']
negative_corpus = " ".join(df_negative)

wordcloud1 = WordCloud(background_color='white', width=2500, height=2000).generate(negative_corpus)

plt.figure(1, figsize=(12, 12))
plt.imshow(wordcloud1)
plt.axis('off')
plt.show()

# TOKENIZATION

Tokenization is the process of splitting a string of text into a list of tokens. A token is a part of a larger unit: a word is a token in a sentence, and a sentence is a token in a paragraph. Tokenization is the first step of text preprocessing. It separates the words from the sentences so that machine learning techniques can be applied more easily.
The nltk library provides ready-made sentence and word tokenizers; the cell below uses a simple regular-expression split on non-word characters.

In [None]:
import re

def tokenize(txt):
    """Split ``txt`` into tokens on runs of non-word characters.

    Parameters
    ----------
    txt : str
        Text to split.

    Returns
    -------
    list of str
        The pieces between separators. As with any ``re.split``, text that
        starts or ends with a separator yields an empty string at that end.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape
    # sequence and triggers a DeprecationWarning/SyntaxWarning on recent
    # Python versions.
    tokens = re.split(r'\W+', txt)
    return tokens

df['review_text'] = df['review_text'].apply(lambda x: tokenize(x.lower()))

df.head()

# STOPWORDS REMOVAL

Removing stopwords is an important part of any NLP project, because the data contains many very common words that carry little meaning and should be removed before modelling. This can be done with nltk by using its "stopwords" corpus.

In [None]:
import nltk
# English stop-word list from NLTK.
# NOTE(review): this rebinds the name `stopwords`, which an earlier cell
# imported from nltk.corpus as the corpus reader. Re-running that import cell
# after this one would make `stopwords` the reader again and break the
# membership tests that expect this list.
stopwords = nltk.corpus.stopwords.words('english')
stopwords

In [None]:
def remove_stopwords(txt_tokenized):
    """Return the tokens of ``txt_tokenized`` that are not in the
    module-level ``stopwords`` collection, in their original order."""
    return list(filter(lambda word: word not in stopwords, txt_tokenized))

df['review_text'] = df['review_text'].apply(lambda x: remove_stopwords(x))

df.head()

# STEMMING

Stemming is the process of reducing a word to its stem by stripping affixes (suffixes and prefixes), so that related word forms share a common root. Stemming is important in natural language understanding (NLU) and natural language processing (NLP). Here we will use the PORTER STEMMER. In simple words, stemming can be understood with the words below:
1)run
2)running
3)ran
4)runs
When stemming is applied, the regular forms 'run', 'running' and 'runs' are reduced to the same form (i.e. 'run'). An irregular form such as 'ran' is not changed by suffix stripping; mapping it to 'run' requires lemmatization.

In [None]:
import nltk
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [None]:
def stemming(tokenized_text):
    """Apply the module-level stemmer ``ps`` to every token and return the
    stems as a list, in the same order as the input."""
    return list(map(ps.stem, tokenized_text))

df['review_text'] = df['review_text'].apply(lambda x: stemming(x))

df.head()

# FEATURE EXTRACTION(BAG OF WORDS)

In this section of preprocessing we run the last commands before extracting the bag of words for feature extraction.
In this part we join the text in the data, because each review is now stored as a list of tokens separated by commas.
We need to turn those token lists back into text, so that there won't be any error while executing CountVectorizer.

In [None]:
import string

In [None]:
def clean_text(txt, stemmer=None, stop_words=None):
    """Turn one document into a list of stemmed, stop-word-free word tokens.

    Intended as the ``analyzer`` callable of the scikit-learn vectorizers,
    which call it with a single document and iterate over the returned
    features.

    Parameters
    ----------
    txt : str or iterable of str
        Raw text, or an already tokenised document. Token sequences are
        joined with single spaces before processing.
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to the module-level ``ps``.
    stop_words : container of str, optional
        Words to drop. Defaults to the module-level ``stopwords``.

    Returns
    -------
    list of str
        The stemmed tokens, in document order. Returning the tokens (rather
        than the joined string, as before) matters: iterating over a string
        yields single characters, so the vectorizer was building a vocabulary
        of characters instead of words.
    """
    if stemmer is None:
        stemmer = ps
    if stop_words is None:
        stop_words = stopwords
    if not isinstance(txt, str):
        # Joining with '' (as before) glued all tokens into one long word.
        txt = ' '.join(txt)
    # Membership test: `c is not string.punctuation` compared each character's
    # identity with the whole punctuation string and was therefore always True.
    txt = ''.join(c for c in txt if c not in string.punctuation)
    tokens = re.split(r'\W+', txt)
    # `word and ...` skips the empty strings re.split yields at the edges.
    return [stemmer.stem(word) for word in tokens if word and word not in stop_words]

In [None]:
df.head()

# MODEL BUILDING

Now that the data cleaning and processing part is complete, I will use COUNTVECTORIZER to convert the tokens into numerical data, because machine learning models do not understand text directly; it needs to be converted to numbers. The result of the conversion is stored as a sparse matrix. In this section we are going to build the model, train it and test it.
Here I will use a LOGISTIC REGRESSION model. I have used 70% of the data for training and 30% for testing.

In [None]:
df[df['target']== 1]['review_text'].count()

In [None]:
df[df['target']== 0]['review_text'].count()

In [None]:
pos_train = df[df['target']==1][['review_text', 'target']].head(29200)
neg_train = df[df['target']==0][['review_text', 'target']].head(6430)

In [None]:
pos_test = df[df['target']==1][['review_text', 'target']].tail(12513)
neg_test = df[df['target']==0][['review_text', 'target']].tail(2755)

In [None]:
# Combine the positive and negative parts and shuffle the rows. A fixed
# random_state (the same seed the classifiers in this notebook use) makes the
# shuffle, and therefore the reported scores, reproducible across runs.
train_df = pd.concat([pos_train, neg_train]).sample(frac = 1, random_state=42).reset_index(drop=True)
test_df = pd.concat([pos_test, neg_test]).sample(frac = 1, random_state=42).reset_index(drop=True)

In [None]:
train_df.head()

In [None]:

X_train = train_df['review_text']
X_test  = test_df['review_text']
y_train = train_df['target']
y_test  = test_df['target']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Binary (presence/absence) encoding; the vocabulary is learned from the
# training reviews only and then reused for the test reviews.
vectorizer = CountVectorizer(binary=True, analyzer=clean_text)
X_train_onehot = vectorizer.fit_transform(X_train)
X_test_onehot = vectorizer.transform(X_test)

In [None]:
print(X_train_onehot.shape)
print(X_test_onehot.shape)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import seaborn as sns

In [None]:
def fit_and_test(classifier, X_train, y_train, X_test, y_test, only_return_accuracy=False):
  """Fit ``classifier`` on the training data and print its test scores.

  Accuracy is always printed; the F1 score is printed as well unless
  ``only_return_accuracy`` is true. Despite the flag's name, nothing is
  returned -- the function always returns None.
  """
  classifier.fit(X_train, y_train)
  predictions = classifier.predict(X_test)
  print('accuracy:', accuracy_score(y_test, predictions))
  if only_return_accuracy:
    return
  print('f1_score:', f1_score(y_test, predictions))

In [None]:
for c in [0.01, 0.02, 0.05, 0.25, 0.5, 0.75, 1,]:
  lr = LogisticRegression(C=c, max_iter=1000) # 92.91%
  print (f'At C = {c}:-', end=' ')
  fit_and_test(lr, X_train_onehot, y_train, X_test_onehot, y_test, True)

In [None]:
logistic = LogisticRegression(C=1,max_iter=10000)

logistic.fit(X_train_onehot,y_train)

In [None]:
prediction = logistic.predict(X_test_onehot)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
# Take the tick labels from both the true and the predicted values and pass
# them to confusion_matrix, so labels and matrix always agree -- even if the
# model never predicts one of the classes.
names = np.unique(np.concatenate([np.asarray(y_test), np.asarray(prediction)]))
cm_lgr1 = confusion_matrix(y_test, prediction, labels=names)
sns.heatmap(cm_lgr1, square=True, annot=True, cbar=False,xticklabels=names, yticklabels=names, cmap="YlGnBu" ,fmt='g')
# confusion_matrix(y_true, y_pred) puts true labels on the rows and predicted
# labels on the columns; the heatmap draws rows on the y-axis and columns on
# the x-axis. The axis labels were previously swapped.
plt.xlabel('Predicted')
plt.ylabel('Truth')

In [None]:
# Store the score under its own name. Assigning it to `accuracy_score` would
# replace the imported sklearn function with a float, so re-running this cell
# (or any later call to the function) would fail with
# "'float' object is not callable".
logistic_accuracy = accuracy_score(y_test,prediction)
print("Accuracy of  Logistic Regression :",logistic_accuracy)

# CALCULATING ACCURACY ON MANUAL FEATURES

We have already calculated the accuracy of the model above; we will now also calculate the accuracy of models trained on the manual features that we extracted in the data cleaning part.
In this section we will try 5 different MACHINE LEARNING MODELS to see whether the accuracy can be increased.

In [None]:
df_feat = df.drop(labels =['rating','title','review_text','branch','target','emails'], axis =1)

In [None]:
df_feat

In [None]:
y = df['target']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(analyzer= clean_text)
text_counts = cv.fit_transform(df['review_text'])

In [None]:
text_counts.toarray().shape

In [None]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back to
# the old method only on older releases.
feature_names = cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out') else cv.get_feature_names()
df_bagow = pd.DataFrame(text_counts.toarray(),columns = feature_names)

In [None]:
df_bagow

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
sgd = SGDClassifier(n_jobs=-1,random_state=42,max_iter=200)
lgr = LogisticRegression(random_state=42,max_iter=200)
lgrcv = LogisticRegressionCV(cv=10,random_state=42,max_iter=1000)
svm = LinearSVC(random_state=42,max_iter=200)
rfc = RandomForestClassifier(random_state=42,n_jobs=-1,n_estimators=200)

In [None]:
clf = {'SGD':sgd, 'LGR':lgr, 'LGR-CV':lgrcv, 'SVM':svm, 'RFC':rfc}

In [None]:
clf.keys()

In [None]:
def classify(X,y):
    """Train every model in the module-level ``clf`` dict and print its accuracy.

    The data is split 80/20 (stratified on ``y``, ``random_state=42``), the
    features are min-max scaled, and each classifier is fitted on the
    training part and scored on the test part.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix, one row per sample.
    y : array-like
        Class labels, aligned with the rows of ``X``.

    Returns
    -------
    None
        One line per classifier is printed as ``<name> ----> <accuracy>``.
        The estimators stored in ``clf`` are fitted in place.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)
    # Fit the scaler on the training rows only. Fitting it on all of X before
    # the split (as before) leaks the test set's min/max into training.
    scaler = MinMaxScaler(feature_range=(0,1))
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    for key in clf.keys():
        clf[key].fit(X_train, y_train)
        y_pred = clf[key].predict(X_test)
        ac = accuracy_score(y_test, y_pred)
        print(key, '---->', ac)

In [None]:
%%time
classify(df_bagow,y)

In [None]:
df_feat.head()

In [None]:
%%time
classify(df_feat,y)

# MANUAL FEATURES + BAG OF WORDS

In [None]:
X = df_feat.join(df_bagow)

In [None]:
X

In [None]:
%%time
classify(X,y)

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer = clean_text)
X = tfidf.fit_transform(df['review_text'])

In [None]:
%%time
classify(pd.DataFrame(X.toarray()),y)