In [None]:
# import libraries for data parsing
import pandas as pd
import numpy as np
import random as rnd
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
import re
from scipy.sparse import hstack
from scipy.stats import uniform


# import plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# import ML models
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.metrics import f1_score,accuracy_score,confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Read the raw train and test review exports from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train (3) (1) (3) (2).csv',
        './data/test (3) (1) (3) (2).csv',
    )
)

In [None]:
# Preview the first five rows of the test set.
test.head(n=5)



In [None]:
# The test data set has no Star Rating column, so it cannot be used for evaluation; instead, the training data is split 80:20.

In [None]:
# Preview the first five rows of the training set.
train.head(n=5)

In [None]:
# Keep only the id, the two text fields and the label (unneeded columns such as
# app version and version name are dropped) and replace every missing value
# with the literal string 'None'.
# The test frame is left untouched: it has no 'Star Rating', so the training
# data itself is split later on.
train = train[['id', 'Review Text', 'Review Title', 'Star Rating']].fillna('None')
train.head()



In [None]:
# Text cleaning function and its lookup tables.
# The tables and the compiled URL pattern are built once here instead of on
# every clean_text() call.

# Matches http(s):// and bare www. links. The scheme and the "www" prefix are
# matched in lowercase only, so the pattern must be applied to lowercased text.
_URL_PATTERN = re.compile(
    r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})"
)

# Typographic quote variants. Each used to be rewritten to a space-padded ASCII
# quote which was then itself stripped, i.e. it nets out to three blanks.
_QUOTE_VARIANTS = ('`', '′', '“', '”', '‘')

# Characters that are replaced by a single blank.
_STRIP_CHARS = [
    ',', '.', '"', ':', ')', '(', '-', '|', ';', "'", '[', ']', '>', '=', '+', '\\', '•', '~', '@',
    '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…',
    '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─',
    '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞',
    '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√',
]

# Punctuation that is kept but padded with blanks so it becomes its own token.
_SPACED_PUNCTUATION = ['!', '?', '$', '&', '/', '%', '#', '*', '£']


def _build_clean_table():
    """Build the single-pass ``str.translate`` table used by ``clean_text``."""
    table = {}
    for ch in _STRIP_CHARS:
        # The text is lowercased before translation, so the lowercase form of
        # every entry is registered as well; otherwise upper-case entries such
        # as 'Ã' or 'Ø' could never match anything.
        for variant in (ch, ch.lower()):
            if len(variant) == 1:
                table[ord(variant)] = ' '
    for ch in _QUOTE_VARIANTS:
        table[ord(ch)] = '   '
    for ch in _SPACED_PUNCTUATION:
        table[ord(ch)] = f' {ch} '
    return table


_CLEAN_TABLE = _build_clean_table()


def clean_text(x):
    """Normalise one piece of review text for vectorisation.

    Steps, in order:
      1. convert ``x`` to ``str`` and lowercase it;
      2. replace every URL with the literal token ``url``;
      3. replace the characters in ``_STRIP_CHARS`` (and their lowercase forms)
         with a blank, the quote variants with three blanks, and pad the
         characters in ``_SPACED_PUNCTUATION`` with a blank on each side.

    Whitespace is not collapsed, so runs of blanks can remain in the result.

    Parameters
    ----------
    x : object
        Value to clean; anything accepted by ``str()`` (``None`` becomes ``'none'``).

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    x = str(x).lower()
    x = _URL_PATTERN.sub("url", x)
    return x.translate(_CLEAN_TABLE)

In [None]:
# Run clean_text over both free-text columns, then index the frame by review id
# (the 'id' column itself is kept).
train['Review Text'] = train['Review Text'].map(clean_text)
train['Review Title'] = train['Review Title'].map(clean_text)
train = train.set_index('id', drop=False)

In [261]:
# Hold out 20% of the labelled reviews for evaluation; the seed is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    train[['Review Text', 'Review Title']],
    train['Star Rating'],
    test_size=0.2,
    random_state=42,
)

train_text1 = X_train['Review Text']
train_text2 = X_train['Review Title']
# Join text and title with a blank, exactly as done for the test split below.
# Without the separator the last word of the text and the first word of the
# title fuse into a single token in the training data only.
train_text3 = X_train['Review Text'] + ' ' + X_train['Review Title']

test_text1 = X_test['Review Text']
test_text2 = X_test['Review Title']
test_text3 = X_test['Review Text'] + ' ' + X_test['Review Title']
print(test_text3[0:10])

# Corpus the TF-IDF vectorizers are fitted on.
# NOTE(review): this includes the held-out test texts, so the vocabulary and
# IDF weights see evaluation data -- confirm that this is intended.
all_text = pd.concat([train_text1, train_text2, test_text1, test_text2, train_text3])

id
7af38198-2aee-4d54-8666-c3569c87c314                                        good app none
b2dd8447-23dc-4da5-8753-135ea65c7169    very bad    whenever you want to do a recharge...
8287a8fd-e68e-4bce-a119-c57dd19175f6    uninstalled this will not install again kindly...
d346edf6-c150-412a-9f3f-157063ce2a05                                 very useful app none
ffc593ae-07ed-44c2-abc4-f9c2033b0a58    a very good innovation  there are places for i...
7e4865ff-a38e-451a-8f05-ab4133df4b0f    it s worst experience from this app  because r...
1b1a0fe9-0194-43d2-9b32-b61c83349371                                        good one none
e3e6e6c4-b821-42d6-8fb2-50d427ac3c75                                         useless none
11bfbc3a-79dd-49f6-922d-573ef5973ad8    one of the best app for every paytment thank u...
f7b29b19-b89c-4c13-8f41-1ad4e18e1776    i want to pay my electricity bill   but i alre...
dtype: object


In [263]:
def predict_one(x, y, xt, random_state=42, scoring=None):
    """Grid-search a multinomial logistic regression on ``(x, y)`` and predict ``xt``.

    A 5-fold ``GridSearchCV`` is run over ``C`` in ``[1, 30]`` and ``penalty``
    in ``['l1', 'l2']``. The parameters of the best estimator are printed and
    the search object (refit on all of ``x``) is used to predict ``xt``.

    Parameters
    ----------
    x : feature matrix used for fitting.
    y : labels aligned with the rows of ``x``.
    xt : feature matrix to predict labels for.
    random_state : int or None, default 42
        Seed forwarded to ``LogisticRegression``. The 'saga' solver shuffles
        the data, so without a seed repeated runs can give different models.
    scoring : str, callable or None, default None
        Forwarded to ``GridSearchCV``. ``None`` keeps the estimator's default
        score (accuracy).
        NOTE(review): the notebook reports weighted F1, so 'f1_weighted' may be
        the more consistent selection criterion -- confirm before changing.

    Returns
    -------
    Predicted labels for ``xt`` as returned by ``GridSearchCV.predict``.
    """
    # Base estimator; class_weight='balanced' reweights classes by frequency.
    logistic = LogisticRegression(
        n_jobs=10,
        solver='saga',
        multi_class='multinomial',
        class_weight='balanced',
        random_state=random_state,
    )

    # Exhaustive grid (not a random search): 2 values of C x 2 penalties.
    hyperparameters = {'C': [1, 30], 'penalty': ['l1', 'l2']}

    clf = GridSearchCV(logistic, hyperparameters, cv=5, scoring=scoring)
    best_model = clf.fit(x, y)
    y_pred = best_model.predict(xt)

    print(best_model.best_estimator_.get_params())
    return y_pred
    


In [264]:
# Word-level TF-IDF: unigrams and bigrams, English stop words removed,
# sublinear term frequency, vocabulary capped at 100k features.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 2),
    max_features=100000)
word_vectorizer.fit(all_text)

# Suffix 1 = review text, 2 = review title, 3 = text and title concatenated.
train_word_features1 = word_vectorizer.transform(train_text1)
train_word_features2 = word_vectorizer.transform(train_text2)
train_word_features3 = word_vectorizer.transform(train_text3)

test_word_features1 = word_vectorizer.transform(test_text1)
test_word_features2 = word_vectorizer.transform(test_text2)
# This line previously recomputed train_word_features3, so the test-side
# features for the concatenated text were never created.
test_word_features3 = word_vectorizer.transform(test_text3)

In [265]:
# Character-level TF-IDF over 2- to 6-character n-grams, sublinear term
# frequency, vocabulary capped at 1M features.
# stop_words is deliberately not passed: it only applies to analyzer='word',
# so with analyzer='char' it is ignored and just produces a warning.
charector_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=1000000)
charector_vectorizer.fit(all_text)

# Suffix 1 = review text, 2 = review title, 3 = text and title concatenated.
train_char_features1 = charector_vectorizer.transform(train_text1)
train_char_features2 = charector_vectorizer.transform(train_text2)
train_char_features3 = charector_vectorizer.transform(train_text3)

test_char_features1 = charector_vectorizer.transform(test_text1)
test_char_features2 = charector_vectorizer.transform(test_text2)
test_char_features3 = charector_vectorizer.transform(test_text3)

In [257]:
# Column-wise concatenation of the sparse TF-IDF blocks.

# Character n-gram features: review text followed by review title.
train_char_features = hstack((train_char_features1, train_char_features2))
test_char_features = hstack((test_char_features1, test_char_features2))

# Word n-gram features: review text followed by review title.
train_word_features = hstack((train_word_features1, train_word_features2))
test_word_features = hstack((test_word_features1, test_word_features2))

# Joint features: char + word for the text, then char + word for the title.
train_features = hstack((
    train_char_features1,
    train_word_features1,
    train_char_features2,
    train_word_features2,
))
test_features = hstack((
    test_char_features1,
    test_word_features1,
    test_char_features2,
    test_word_features2,
))



In [258]:
# Five independent frames, each holding only the 'id' column of the full
# training frame, one per prediction variant.
sub_char, sub_word, sub_joint, sub_mean, sub_max = (
    pd.DataFrame({'id': train['id']}) for _ in range(5)
)

# Training labels under the short name used by the prediction cell.
y = y_train

In [259]:
# Fit and predict once per feature set: char n-grams, word n-grams, and both combined.
y_pred_char = predict_one(x=train_char_features, y=y, xt=test_char_features)
y_pred_word = predict_one(x=train_word_features, y=y, xt=test_word_features)
y_pred_joint = predict_one(x=train_features, y=y, xt=test_features)
    

KeyboardInterrupt: 

In [None]:
#for t in class_names:
#    y = train1[t].values
#    y_pred_char = predict_one(train_char_features, y, test_char_features)
#    y_pred_word = predict_one(train_word_features, y, test_word_features)
#    y_pred_joint = predict_one(train_features, y, test_features)
    
#    sub_char[t] = y_pred_char
#    sub_word[t] = y_pred_word
#    sub_joint[t] = y_pred_joint #best
#   sub_mean[t] = 0.5 * (y_pred_char + y_pred_word)
#    sub_max[t] = np.maximum(y_pred_char, y_pred_word)
#    print('predict {}'.format(t))

In [None]:
# Weighted F1 on the held-out split for the char, word and joint models.
# f1_score expects (y_true, y_pred); the weighted average is weighted by the
# support of the true labels, so the argument order affects the score.
print(f1_score(y_test, y_pred_char, average='weighted'))
print(f1_score(y_test, y_pred_word, average='weighted'))
print(f1_score(y_test, y_pred_joint, average='weighted'))