<a href="https://colab.research.google.com/github/skauntey/ALMS-II-sentiment-analysis/blob/main/5_ALMS_Model2_250421.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Kaggle API client used by the download cells (-q keeps pip quiet).
! pip install -q kaggle

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os, shutil
import kaggle
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline

## 2. Downloading kaggle competition files


In [None]:
dataset = "sentiment-analysis-on-movie-reviews"
def kaggle_dataset_download(dataset):
    """Authenticate with the Kaggle API and download a competition's files.

    The files of competition ``dataset`` are written to a ``dataset/`` folder
    under the current working directory; ``force=True`` is passed to the
    download call.
    """
    api = kaggle.api
    api.authenticate()
    download_dir = str(os.getcwd()) + "/dataset/"
    api.competition_download_files(dataset, path=download_dir, force=True)
    print('Data downloaded!')

kaggle_dataset_download(dataset)

In [None]:
## To download the dataset, make sure the kaggle.json credentials file is available at ~/.kaggle/kaggle.json.
# Instructions for obtaining kaggle.json are under the 'API credentials' heading of the GitHub page:
# https://github.com/Kaggle/kaggle-api

import zipfile
#Define file to download

dataset = "sentiment-analysis-on-movie-reviews"

def kaggle_dataset_download(dataset):
    """Download the files of the Kaggle competition ``dataset``.

    Authenticates through ``kaggle.api`` first, then saves the competition
    files into ``<current working directory>/dataset/`` with ``force=True``.
    """
    target_path = str(os.getcwd()) + "/dataset/"
    kaggle.api.authenticate()
    kaggle.api.competition_download_files(
        dataset,
        path=target_path,
        force=True,
    )
    print('Data downloaded!')

def unzip_kaggle_file():
    """Unpack the downloaded competition archive into per-split folders.

    Expects ``dataset/sentiment-analysis-on-movie-reviews.zip`` under the
    current working directory. The function

    1. deletes everything in ``dataset/`` whose name does not end in ``.zip``
       (left-overs of a previous run, files and directories alike);
    2. extracts every ``*.zip`` member of the archive into
       ``dataset/<name>/`` (``<name>`` is the member name up to its first
       dot), unpacks that nested archive in place and deletes it;
    3. deletes the outer archive.

    Members of the outer archive that are not zip files are ignored.
    """
    folder_path = os.path.join(os.getcwd(), 'dataset')
    file_path = os.path.join(folder_path, 'sentiment-analysis-on-movie-reviews.zip')

    # Remove anything left from an earlier run. rmtree only works on
    # directories, so regular files (and symlinks) are removed with os.remove.
    for entry in os.listdir(folder_path):
        if entry.endswith('.zip'):
            continue
        stale = os.path.join(folder_path, entry)
        if os.path.isdir(stale) and not os.path.islink(stale):
            shutil.rmtree(stale)
        else:
            os.remove(stale)

    with zipfile.ZipFile(file_path) as zip_file:
        for member in zip_file.namelist():
            if member.split('.')[-1] != "zip":
                continue
            fdir = member.split('.')[0]
            target_dir = os.path.join(folder_path, fdir)
            # extract() returns the path it wrote, so there is no need to
            # guess the nested archive from a directory listing.
            nested_zip = zip_file.extract(member, path=target_dir)
            with zipfile.ZipFile(nested_zip, mode='r') as tsv_zip:
                tsv_zip.extractall(path=target_dir)
            os.remove(nested_zip)

    os.remove(file_path)
    print('Kaggle file is downloaded and unzipped!')


kaggle_dataset_download(dataset)
unzip_kaggle_file()

# 3. Converting files to DataFrame

In [None]:
#C:\Users\eq\Documents\UCL\ELEC0135 ALMS II\ALMS-II-sentiment-analysis\dataset\train
# Load the first entry that os.listdir returns for dataset/train as a
# tab-separated file.
file_path = os.path.join(str(os.getcwd()), 'dataset', 'train')
file = os.listdir(file_path)[0]
path = os.path.join(file_path, file)
train_data = pd.read_csv(path, sep="\t")

# (name, id) pairs; a pair's position in the list is used as its class id.
label_class = [('negative', 0), ('somewhat negative', 1), ('neutral', 2), ('somewhat positive', 3), ('positive', 4)]
class_names_by_position = [name for name, _ in label_class]

phrases = train_data['Phrase']
pdf = pd.DataFrame({
    'sentence_id': train_data['SentenceId'],
    'text_reviews': phrases,
    'class_id': train_data['Sentiment'],
    'sentence_len': [len(phrase) for phrase in phrases],
})

# Human-readable class name next to the numeric id.
pdf['class_'] = [class_names_by_position[idx] for idx in pdf['class_id']]

# Keep only phrases longer than 15 characters.
pdf = pdf.loc[pdf['sentence_len'] > 15]

pdf_data, pdf_labels = pdf['text_reviews'], pdf['class_id']

print(pdf_data.shape)
print(pdf_labels.shape)
pdf.head(10)

## 3.1 Cleaning data

In [None]:
import re
from sklearn.model_selection import train_test_split
# Separator characters that are replaced by a space. A raw string is used so
# that "\[", "\]" and "\|" reach the regex engine without relying on Python
# keeping invalid string escapes (which warns on newer interpreters).
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase letters, space, '#', '+' and '_' is removed.
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
# Set copy of STOPWORDS for O(1) membership tests; the list is kept as-is
# for any other code that reads it.
_STOPWORD_SET = frozenset(STOPWORDS)


def clean_text_func(text):
    """Normalise one review string for vectorisation.

    Steps, in order: lowercase; replace the separator characters matched by
    ``REPLACE_BY_SPACE_RE`` with spaces; delete every character matched by
    ``BAD_SYMBOLS_RE``; drop tokens listed in ``STOPWORDS``; re-join the
    remaining tokens with single spaces.

    Returns the cleaned string, which is empty when nothing survives.

    NOTE(review): symbols (apostrophes included) are stripped before the
    stop-word filter, so a contraction such as "he's" becomes "hes" and is
    NOT removed. Kept as-is to leave the cleaned text unchanged.
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    return ' '.join(word for word in text.split() if word not in _STOPWORD_SET)

def preprocessing_dataset(pdf_data, pdf):
    """Clean the review text and drop rows whose cleaned text is empty.

    ``pdf_data`` is cleaned with ``clean_text_func`` only to decide which rows
    of ``pdf`` to keep (those whose cleaned text is non-empty). The kept rows
    are re-indexed from zero and their ``text_reviews`` column is replaced by
    its cleaned version. Returns the resulting DataFrame.
    """
    keep_mask = pdf_data.apply(clean_text_func).astype(bool)
    kept_rows = pdf[keep_mask].reset_index(drop=True)
    kept_rows['text_reviews'] = kept_rows['text_reviews'].apply(clean_text_func)
    return kept_rows

pdf = preprocessing_dataset(pdf_data,pdf)
pdf.head()
pdf.info()

#### 3.2 Viewing data

In [None]:
sen_numbers = [x for x in set(pdf['sentence_id'])]
def de_agument_text(pdf):
    """Collapse phrase-level rows to one row per sentence.

    For every distinct ``sentence_id`` in ``pdf`` the row with the largest
    ``sentence_len`` is kept. The sentence ids are taken from ``pdf`` itself,
    and a stable sort is used so the choice among equally long rows is
    deterministic.

    Returns a list with one array per sentence, in ascending ``sentence_id``
    order, each holding ``[text_reviews, class_id, sentence_len, class_]``.
    An empty frame yields an empty list.
    """
    columns = ['text_reviews', 'class_id', 'sentence_len', 'class_']
    # One sort plus one de-duplication replaces a full-frame scan per sentence.
    longest_first = pdf.sort_values('sentence_len', ascending=False, kind='mergesort')
    one_per_sentence = longest_first.drop_duplicates('sentence_id', keep='first')
    one_per_sentence = one_per_sentence.sort_values('sentence_id', kind='mergesort')
    return list(one_per_sentence[columns].to_numpy())

pdf_compressed = pd.DataFrame(de_agument_text(pdf), columns=['text_reviews', 'class_id', 'sentence_len', 'class_'])
pdf_compressed

In [None]:
print(pdf.shape)
print(pdf_compressed.shape)

In [None]:
import matplotlib.pyplot as plt

# Class names mapped to their numeric ids; the names are the x-axis categories.
labels = {'negative':0,'somewhat negative':1,'neutral': 2,'somewhat positive':3,'positive':4}
# A real list rather than a dict view, so it can be used as bar positions.
x = list(labels)
class_ids = list(labels.values())

# Count reviews per class. reindex() guarantees one value per class, in the
# same order as `x`, even when a class is missing from one of the datasets.
y_compressed = pdf_compressed.groupby('class_id').text_reviews.count().reindex(class_ids, fill_value=0)

y_pdf = pdf.groupby('class_id').text_reviews.count().reindex(class_ids, fill_value=0)


fig, [ax1, ax2]= plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

ax1.bar(x, y_compressed)
# Rotate the existing category tick labels instead of overwriting them.
ax1.tick_params(axis='x', labelrotation=45)
ax1.set_xlabel('Filtered Dataset: Class distribution', fontsize = 14)
ax1.set_ylabel('Number of reviews in each class', fontsize = 14)

ax2.bar(x, y_pdf)
ax2.tick_params(axis='x', labelrotation=45)
ax2.set_xlabel('Original Dataset: Class distribution', fontsize = 14)
ax2.set_ylabel('Number of reviews in each class', fontsize = 14)


plt.show()



In [None]:
from io import StringIO
# One row per class, ordered by class id, plus lookup tables in both directions.
id_class_df = pdf[['class_id', 'class_']].drop_duplicates().sort_values('class_id')
id_class = dict(zip(id_class_df['class_id'], id_class_df['class_']))
class_id_dict = dict(zip(id_class_df['class_'], id_class_df['class_id']))
class_id_dict

# Text Representation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

In [None]:
# Hold out 10% of each dataset for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    pdf['text_reviews'], pdf['class_id'], test_size=0.1, random_state=0)
(X_train_reduced_text, X_test_reduced_text,
 y_train_reduced_text, y_test_reduced_text) = train_test_split(
    pdf_compressed['text_reviews'], pdf_compressed['class_id'], test_size=0.1, random_state=0)

print ("Numpy array of Original Dataset")
# Full corpora and labels as numpy arrays.
X_corpus, y_labels = pdf['text_reviews'].to_numpy(), pdf['class_id'].to_numpy()
X_corpus_smallds = pdf_compressed['text_reviews'].to_numpy()
y_labels_smallds = pdf_compressed['class_id'].to_numpy()

print ("Training/Testing set from original Dataset")
X_train, X_test, y_train, y_test = (
    part.to_numpy() for part in (X_train, X_test, y_train, y_test))

print ("Training/Testing set from redused Dataset")
X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = (
    part.to_numpy()
    for part in (X_train_reduced_text, X_test_reduced_text,
                 y_train_reduced_text, y_test_reduced_text))



In [None]:
#Verifying whether data is in the correct format

# Types and shapes of the full corpora. Each pair of shapes is printed for the
# same arrays whose types are printed just above it.
print ("Info of Reduced Dataset")
print(type(X_corpus_smallds))
print(type(y_labels_smallds))
print(X_corpus_smallds.shape)
print(y_labels_smallds.shape)

print ("Info of Original Dataset")
print(type(X_corpus))
print(type(y_labels))
print(X_corpus.shape)
print(y_labels.shape)


print ("Training/Testing set from the original Dataset")

print(type(X_train))
print(type(X_test))
print(type(y_train))
print(type(y_test))

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

print ("Training/Testing set from the reduced Dataset")

print(type(X_train_reduced))
print(type(X_test_reduced))
print(type(y_train_reduced))
print(type(y_test_reduced))

print(X_train_reduced.shape)
print(X_test_reduced.shape)
print(y_train_reduced.shape)
print(y_test_reduced.shape)



# Linear SVC

In [None]:
# Creating Pipeline
# Model: linear SVM trained with SGD (SGDClassifier, hinge loss) on the original dataset
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features feeding a hinge-loss SGD classifier. The commented-out
# entries are other classifier choices left for reference.
pipeline = Pipeline([
        ('vect', TfidfVectorizer(min_df=5, max_df=0.95, sublinear_tf=False, norm='l2', encoding='latin-1', stop_words='english')),
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),
        #('RandomF', RandomForestClassifier(n_estimators=500, max_depth=3, random_state=0)),
        #('NB', MultinomialNB()),
        #('clf', LinearSVC(C=1000)),
        #('LR', LogisticRegression(max_iter = 1000, solver = 'lbfgs', random_state=0))
    ])

# Only the n-gram range of the vectorizer is searched.
parameters = {
        'vect__ngram_range': [(1, 1), (1,2)],
    }

#Fit the pipeline on the training set using grid search for the parameters
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)

# train the training set
grid_search.fit(X_train, y_train)

#make the prediction
y_predicted = grid_search.predict(X_test)

# Class ids and names in ascending id order. The report and the confusion
# matrix order their rows by sorted label, so the names must follow that same
# order. pdf['class_'].unique() is in first-appearance order and mislabels
# the rows. Passing `labels` also keeps all five classes present when one is
# absent from the test split.
report_ids = id_class_df.class_id.values
report_names = id_class_df.class_.values

#Print the classification report
print(metrics.classification_report(y_test, y_predicted,
                                        labels=report_ids,
                                        target_names=report_names))

# Print and plot the confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

cm = metrics.confusion_matrix(y_test, y_predicted, labels=report_ids)
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=report_names, yticklabels=report_names)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()



# SGD classifier (linear SVM) on the reduced dataset

In [None]:
# Creating Pipeline
# Model: linear SVM trained with SGD (SGDClassifier, hinge loss) on the reduced dataset

# Same pipeline as for the original dataset, but with sublinear tf scaling,
# trained and evaluated on the reduced (one row per sentence) dataset. The
# commented-out entries are other classifier choices left for reference.
pipeline = Pipeline([
        ('vect', TfidfVectorizer(min_df=5, max_df=0.95, sublinear_tf=True, norm='l2', encoding='latin-1', stop_words='english')),
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),
        #('RandomF', RandomForestClassifier(n_estimators=500, max_depth=3, random_state=0)),
        #('NB', MultinomialNB()),
        #('clf', LinearSVC(C=1000)),
        #('LR', LogisticRegression(max_iter = 1000, solver = 'lbfgs', random_state=0))
    ])

# Only the n-gram range of the vectorizer is searched.
parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
    }

#Fit the pipeline on the training set using grid search for the parameters
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)

# train the training set
grid_search.fit(X_train_reduced, y_train_reduced)

#make the prediction
y_predicted_r = grid_search.predict(X_test_reduced)

# Class ids and names in ascending id order. The report and the confusion
# matrix order their rows by sorted label, so the names must follow that same
# order. pdf['class_'].unique() is in first-appearance order and mislabels
# the rows. Passing `labels` also keeps all five classes present when one is
# absent from the test split.
report_ids_r = id_class_df.class_id.values
report_names_r = id_class_df.class_.values

#Print the classification report
print(metrics.classification_report(y_test_reduced, y_predicted_r,
                                        labels=report_ids_r,
                                        target_names=report_names_r))

# Print and plot the confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

cm = metrics.confusion_matrix(y_test_reduced, y_predicted_r, labels=report_ids_r)
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=report_names_r, yticklabels=report_names_r)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF matrix of the reduced corpus, converted to a dense array.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

features = tfidf.fit_transform(X_corpus_smallds).toarray()
labels = y_labels_smallds
features.shape

In [None]:
from sklearn.feature_selection import chi2
import numpy as np

# Number of top-scoring terms to print per class and n-gram size.
N = 3

# Vocabulary in column order of `features`. get_feature_names() was removed in
# scikit-learn 1.2, so use its replacement when the installed version has it.
# This is loop-invariant, so it is built once.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.array(tfidf.get_feature_names())

for class_id, class_ in sorted(id_class.items()):
    # One-vs-rest chi-squared score of every term for this class.
    features_chi2 = chi2(features, labels == class_id)
    # Ascending sort: the highest-scoring terms end up last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]

    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    #trigrams = [v for v in feature_names if len(v.split(' ')) == 3]

    print("# '{}':".format(class_))
    print ('=' * 50)
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print ('-' * 30)
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))
    #print ('-' * 30)
    #print("  . Most correlated trigrams:\n. {}".format('\n. '.join(trigrams[-N:])))

### Verifying random text using multinomialNB Classifier

In [None]:
# Bag-of-words counts -> tf-idf weights -> multinomial naive Bayes, all fitted
# on the training split of the original dataset.
count_vect, tfidf_transformer = CountVectorizer(), TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
nb_model = MultinomialNB()
clf = nb_model.fit(X_train_tfidf, y_train)

In [None]:
sample = pdf_compressed.sample().to_numpy()
sample[0][0]

In [None]:
# clf was trained on tf-idf weighted counts, so the sample has to go through
# the same count -> tf-idf transformation before prediction.
clf.predict(tfidf_transformer.transform(count_vect.transform([sample[0][0]])))

In [None]:
pdf_compressed[pdf_compressed['text_reviews'] == sample[0][0]]