In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
# Load the IMDB review dataset from the Kaggle input directory and preview it.
data = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
print(data.head())
# Work with only the first tenth of the rows.
# .copy() turns the subset into an independent frame, so the column
# assignments made in later cells do not write into a slice of the full
# frame (which triggers pandas' SettingWithCopyWarning).
size = data.shape[0]
data = data.iloc[:size // 10].copy()

In [None]:
# Tokenise - find a more efficient way (spacy, textblob)
# Does stop word removal increase the accuracy?
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import string
def preprocessor(text):
    """Normalise a raw review string for bag-of-words vectorisation.

    Steps, in order: lower-case, drop ``[...]`` spans, drop URLs, replace
    HTML tags with a space, turn every remaining non-word character into a
    space, strip underscores, and drop any token containing a digit.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Cleaned text. Runs of spaces are not collapsed.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # URLs and HTML tags must be stripped BEFORE the generic \W pass: that pass
    # replaces '<', '>', ':', '/' and '.' with spaces, after which these two
    # patterns can no longer match and tokens such as "br" or "http" leak
    # into the vocabulary.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Replace tags with a space so the words on either side are not glued together.
    text = re.sub(r'<.*?>+', ' ', text)
    # Remove special characters; this also turns newlines into spaces, so no
    # separate newline substitution is needed afterwards.
    text = re.sub(r'\W', ' ', text)
    # '_' counts as a word character, so it is the one punctuation mark that
    # survives the \W pass and is removed here.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Drop tokens that contain digits.
    text = re.sub(r'\w*\d\w*', '', text)
    return text
# Clean every review with preprocessor(), store the result back in the
# 'review' column, then preview the frame.
cleaned_reviews = data['review'].apply(preprocessor)
data['review'] = cleaned_reviews
data.head()

In [None]:
# Quick summary of the working frame: dimensions, column info and label counts.
print("Shape: ", data.shape)
print()
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "Info:  None" line.
print("Info: ")
data.info()
print()
print("Counts: ", data.sentiment.value_counts())

In [None]:
# Label encoding just changes the classification to a numerical value that is easier to work with as compared to a word/phrase
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data["sentiment"] = le.fit_transform(data["sentiment"])
# A bare expression in the middle of a cell is evaluated and discarded, so the
# preview is printed explicitly to make the encoded labels visible.
print(data.head())

# Some of the data will be used to train and some of the data will be used to test how well the model is doing
from sklearn.model_selection import train_test_split
y = data['sentiment']
x = data['review']
# 80/20 split; random_state is fixed so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0, test_size=0.2)
print("Train: ", x_train.shape)
print("Test: ", x_test.shape)

In [None]:
# Bag of words with count vectoriser
# ngram_range=(1,2) counts both single words and two-word sequences.
# The vocabulary is learned from the training split only; later cells reuse
# this fitted vectoriser to transform the test split.
from sklearn.feature_extraction.text import CountVectorizer
vectCount=CountVectorizer(ngram_range=(1,2))
x_train_trans_Count=vectCount.fit_transform(x_train)

# Classifiers

***Aim:*** Check the performance of each model individually to establish a baseline accuracy that can be achieved, and to compare against it to see whether the ensemble methods make an improvement.

### 1) Logistic regression: 
Suitable for a binary classifier (reviews are either positive or negative).

### 2) Decision Tree:
Classifies more intricately: at each stage the data is analysed for a feature and then sent down to the relevant sub-category.


In [None]:
# CLASSIFIER PIPELINES - logistic regression and decision tree for a good variability in outcome

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

# Vectorise the test split once, using the vocabulary fitted on the training
# split, instead of re-transforming it for every model.
x_test_trans_Count = vectCount.transform(x_test)


def evaluate_baseline(model, title):
    """Fit ``model`` on the vectorised training split and report test metrics.

    Prints ``title``, the accuracy and the classification report.

    Returns
    -------
    tuple
        ``(predictions, accuracy, report)`` for the test split.
    """
    model.fit(x_train_trans_Count, y_train)
    predictions = model.predict(x_test_trans_Count)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)
    print(title)
    print("Accuracy: ", accuracy)
    print(report)
    return predictions, accuracy, report


lr = Pipeline([('lrmodel', LogisticRegression())])
pred_y, score_lr, report_lr = evaluate_baseline(lr, "Logistic regression")

# random_state is fixed (matching the train/test split) so the tree, and
# therefore the baseline score, is the same on every run.
dt = Pipeline([('dtmodel', DecisionTreeClassifier(random_state=0))])
pred_y, score_dt, report_dt = evaluate_baseline(dt, "Decision tree")

# Ensemble methods

### *Parallel ( Models are independent of one another ) :*

### Simple voting classifier:
Hard voting - Each model's prediction is weighted equally, and the most common predicted output is used.
Soft voting - Each model's predicted class probabilities are averaged, and the class with the highest average probability is chosen as the most reliable prediction.
Overall, voting improves stability and predictive performance by combining multiple different learning algorithms and aggregating the results.

### Stacking:
Similar to voting in that different models are used; however, each model isn't necessarily weighted equally. All outputs are put through a final algorithm (the meta-learner), which learns when to trust certain outputs or models more or less.
When multiple base-learner models are useful for a problem but in different ways, stacking allows for improved performance with a meta-learner.

### Bagging:
Generate bootstrap samples, build and fit weak learners on each sample, and take the average of all the predictions to produce a final output.
This method reduces the variance and produces a more generalised and consistent output.

### Random forest:
Uses a diverse range of decision trees, each looking at different sets of features, then votes on all the outputs to provide one.
This technique is effective in overcoming the issue decision trees can have of overfitting the training data, and creates a more generalised algorithm.

### *Sequential ( Models are dependent upon one another ) :*

### Boosting:
In boosting, each new model focuses on the data that was misclassified by the previous model. It typically uses very/fairly weak learners.
Gradient boosting (and extreme gradient boosting) - focuses on reducing the loss function overall.
Adaptive boosting (AdaBoost) - Considers how badly each model misclassifies, assigns weights accordingly and attempts to optimise the weighted sum of errors (it predates gradient boosting, which generalises it, and tends to work very well).
Boosting is aimed at reducing the overall bias of the model by focusing on the areas that are often misclassified; if precision is a priority it is a good option.

In [None]:
# VOTING CLASSIFIER - quick, simple way to produce increased reliability.

from sklearn.pipeline import make_pipeline
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Each base learner carries its own vectoriser, so the ensembles are fitted on
# the raw review text rather than on the pre-computed count matrix.
pipe1 = make_pipeline(CountVectorizer(ngram_range=(1,2)), LogisticRegression(C=1000))

# random_state is fixed (matching the train/test split) so the tree, and every
# ensemble built on this pipeline, gives the same result on each run.
pipe2 = make_pipeline(CountVectorizer(ngram_range=(1,2)), DecisionTreeClassifier(max_depth=6, random_state=0))

# Same estimators, evaluated first with hard voting and then with soft voting.
# After the loop, votingClassifier and y_predicted hold the soft-voting results.
for voting_mode, title in (('hard', "Hard voting:"), ('soft', "Soft voting:")):
    votingClassifier = VotingClassifier(estimators=[('p1', pipe1), ('p2', pipe2)], voting=voting_mode)
    votingClassifier.fit(x_train, y_train)
    y_predicted = votingClassifier.predict(x_test)
    print(title)
    print("Accuracy: ", accuracy_score(y_test, y_predicted))
    print(classification_report(y_test, y_predicted))

In [None]:
# STACKING

from sklearn.ensemble import StackingClassifier

# The two text pipelines act as base learners; a logistic regression is the
# final estimator that combines their outputs.
base_learners = [('p1', pipe1), ('p2', pipe2)]

stack = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(),
)
stack.fit(x_train, y_train)

# Evaluate on the held-out reviews.
y_predicted = stack.predict(x_test)
print("Stacking:")
print("Accuracy: ", accuracy_score(y_test, y_predicted))
print(classification_report(y_test, y_predicted))

In [None]:
# BAGGING

from sklearn.ensemble import BaggingClassifier

# random_state fixes the bootstrap resampling so the reported score is the
# same on every run (the train/test split already uses the same seed).
bag = BaggingClassifier(LogisticRegression(C=1000), random_state=0)

# x_train_trans_Count is the training split already vectorised by vectCount,
# so it is reused instead of transforming x_train a second time.
bag.fit(x_train_trans_Count, y_train)
y_predicted = bag.predict(vectCount.transform(x_test))
print("Bagging:")
print("Accuracy: ", accuracy_score(y_test, y_predicted))
print(classification_report(y_test, y_predicted))

In [None]:
# BOOSTING

from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over up to 200 logistic-regression learners.
boosting = AdaBoostClassifier(
    LogisticRegression(C=1000),
    n_estimators=200
)

# x_train_trans_Count is the training split already vectorised by vectCount,
# so it is reused instead of transforming x_train a second time.
boosting.fit(x_train_trans_Count, y_train)
y_predicted = boosting.predict(vectCount.transform(x_test))
print("Boosting:")
print("Accuracy: ", accuracy_score(y_test, y_predicted))
print(classification_report(y_test, y_predicted))

In [None]:
# IPYNB in vscode and push repo to azure for voting classifier progress 
# Data visualisation
# Reduce dataset size for more variance and redo gridsearch - function 