In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
import langid
from multiprocessing import  Pool
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [4]:
def detect_lang(x):
    '''
    1. Takes the input and detects the language and assigns it to the lang
    2. Returns the first element in lang
    '''
    lang = langid.classify(x)
    return lang[0]

In [5]:
def load_review():
    '''
    1. Importing the csv file into a dataframe
    2. Dropping columns other than business_id, stars and text
    3. Applying the language detection on the text reviews
    4. Filter the dataframe based on language of text 
    5. Return the dataframe
    '''
    
    reviews = pd.read_csv('review.csv')
    
    reviews = reviews.loc[:, ["business_id", "stars", "text"]]

    reviews['lang'] = reviews['text'].apply(lambda row: detect_lang(row))
    
    review = reviews.loc[reviews['lang'] == 'en']
    
    return review

In [8]:
def check(x):
    '''
    1. Takes the input and checks if its positive or negative or neutral and assigns it to the label
    2. Returns the label
    '''
    label = 'positive' if x >= 0.0 else 'negative'
    return label

In [9]:
def add_label(x):
    '''
    Takes in variables, then performs the following:
    1. Assigns the varaibles to dataframe
    2. Calculates Polarity of each review and returns it to a new column in yelp dataframe
    3. Calculates Sentiment_lablel of each review and returns it to a new column in yelp dataframe
    4. Assigns the text, stars and sentiment_label to 2 variables and returns the variables
    '''
        
    yelp = x
    
    yelp['sentiment'] = yelp['text'].apply(lambda review: TextBlob(review).polarity)
    
    yelp['sentiment_label'] = yelp['sentiment'].apply(lambda row: check(row))
    
    yelp = yelp.dropna()
    
    yelp = yelp.loc[:, [ "text","sentiment_label"]]
    return yelp

In [10]:
def parallelize(df, func, n_cores=2):
    '''
    The dataframe is split into n_cores and availability_filter is performed on each sub dataframe
    n_cores: No of processors (96 in this env)
    n_cores is taken as 10 as it takes less time to split the given data.
    '''
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [11]:
def text_process(text):
    '''
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Return the cleaned text as a list of words
    '''
    import string
    
    text = text.lower()
    
    #remove punctuations
    text = [char for char in text if char not in string.punctuation]
    text = ''.join(text)
    
    #remove stopwords
    stops = set(stopwords.words("english"))
    stops.remove("not")
    
    text = text.split()
    text = [word for word in text if not word in stops]
    text = " ".join(text)
    
    return text

In [12]:
def clean_text(data):
    y_df = add_label(data)
    #applies the function on each value of the pandas series
    y_df['text'] = y_df['text'].apply(text_process)
    return y_df

In [14]:
def transform_to_features(data):
    '''
    Takes in variables, then performs the following:
    1. Create a vectorizer variable and assigns the scikit learn countvectorizer to it
    2. Converts the input variable into vectors using the vectorizer
    3. Converts the vectors to an array and assigns the array to a variable
    4. Returns the vector array variable
    '''
    from sklearn.feature_extraction.text import TfidfVectorizer
    # Create feature vectors
    vectorizer = TfidfVectorizer(min_df = 5,
                                 max_df = 0.8,
                                 sublinear_tf = True,
                                 use_idf = True)

    features = vectorizer.fit_transform(data)
    return features

In [15]:
def train_then_build_model(features_A,B):
    from sklearn.model_selection import train_test_split
    '''
    Takes in variables, then performs the following:
    1. Splits the variables into training and test sets in ratio of 80,20
    2. Fits the training and test data into a logistic regression model and starts trsining
    '''

    X_train, X_test, y_train, y_test  = train_test_split(features_A, B, train_size=0.80, random_state=1234)    
         
    from sklearn import svm
    # Perform classification with SVM, kernel=linear
    classifier_linear = svm.SVC(kernel='linear')
    classifier_linear.fit(X=X_train, y=y_train)
    pred = classifier_linear.predict(X_test)
    
    return pred,y_test

In [16]:
def test_metrics(p,q):
    '''
    Takes in variables, then performs the following:
    1. Calculate the accuracy, classification report of the model and prints it
    2. Prints the first 5 reviews thst were predicted correctly
    3. Prints the first 5 reviews that were not predicted correctly
    '''
    y_test = p
    pred = q
    
    print("Accuracy :: {}".format(round(accuracy_score(y_test,pred)*100,2)))
    print("Classification Report ::")
    print(classification_report(y_test,pred))

In [17]:
df = pd.read_pickle('cleaned_yelp_reviews.pickle')

In [19]:
features_A= transform_to_features(df["sentence"])

In [20]:
y_test,pred = train_then_build_model(features_A,df['sentiment'])

In [21]:
test_metrics(y_test,pred)

Accuracy :: 90.98
Classification Report ::
              precision    recall  f1-score   support

           0       0.74      0.85      0.79      1197
           1       0.96      0.92      0.94      4803

    accuracy                           0.91      6000
   macro avg       0.85      0.89      0.87      6000
weighted avg       0.92      0.91      0.91      6000

