<a href="https://colab.research.google.com/github/seanmcalevey/kaggle_challenges/blob/master/imdb_sent_clf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1.0 Basic Imports and Loading Dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

drive.mount('/content/drive/')

  import pandas.util.testing as tm


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


### Import Training DF

In [2]:
drive_path = '/content/drive/My Drive/'
folder_path = 'kaggle_datasets/imdb_dataset/'
file_path = 'labeledTrainData2.tsv'

master_df = pd.read_csv(drive_path + folder_path + file_path, delimiter='\t')
df = master_df.copy()
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


# 2.0 Text Cleaning

## 2.1 Clean Text Func

In [3]:
def clean_text(text_list):

  import re
  import sys
  sys.path.append('/content/drive/My Drive/py_functions')
  from summarizer_func_lib import replace_contraction
  sys.path.remove('/content/drive/My Drive/py_functions')
  from bs4 import BeautifulSoup

  cleaned_reviews = []
  for review in text_list:
    cleaned_review = re.sub('[^A-Za-z.,\s\']', '', review)
    cleaned_review = BeautifulSoup(cleaned_review, 'lxml').get_text() # clean all html from text
    # cleaned_review = lemmatize_func(cleaned_review)

    # Split off periods as their own tokens, and search for contractions and replace them
    review_words = []
    for word in cleaned_review.split():
      word = word.lower()
      if re.search('\w+\.', word):
        word = re.sub('\.', '', word)
        proc_words = replace_contraction(word)
        for w in proc_words.split():
          review_words.append(w)
        review_words.append('.')
      elif re.search('\w+,', word):
        word = re.sub(',', '', word)
        proc_words = replace_contraction(word)
        for w in proc_words.split():
          review_words.append(w)
      elif re.search('br$', word):
        pass
      else:
        proc_words = replace_contraction(word)
        for w in proc_words.split():
          review_words.append(w)
    
    # Combine words together and append
    review_words = ' '.join(review_words)
    cleaned_reviews.append(review_words)

  # cleaned_reviews = clean_stops(cleaned_reviews)
  return cleaned_reviews

# Stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

def clean_stops(text_list):
  stop_words = stopwords.words('english')
  cleaned_reviews = []
  for review in text_list:
    tok_review = review.split()
    proc_review = [word for word in tok_review if word not in stop_words]
    proc_review = ' '.join(proc_review)
    cleaned_reviews.append(proc_review)
  return cleaned_reviews


# Lemmatizer
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet

def lemmatize_func(text, pos_tag='v'):
  lem = WordNetLemmatizer()
  lems = [lem.lemmatize(word, pos_tag) for word in text.split()]
  return ' '.join(lems)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


### Clean Text and Add Back to DF

In [4]:
text_list = df['review']
cleaned_text = clean_text(text_list)
df['lemm_text'] = cleaned_text
df.head()

Unnamed: 0,id,sentiment,review,lemm_text
0,5814_8,1,With all this stuff going down at the moment w...,with all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",the classic war of the worlds by timothy hines...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,the film starts with a manager nicholas bell g...
3,3630_4,0,It must be assumed that those who praised this...,it must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy and wondrously unpretentious '...


## 2.2 Train Test Split

In [6]:
from sklearn.model_selection import train_test_split

X = df['lemm_text']
y = df['sentiment']

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, stratify=y_train_full, test_size=0.15, random_state=42)

## 2.3: Linear SVC Model with TfidfVectorizer

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

max_feats = 40000
pipe = make_pipeline(TfidfVectorizer(ngram_range=(1,3), stop_words='english', max_features=max_feats), LinearSVC())
param_grid = {'linearsvc__C': [0.6]}
grid = GridSearchCV(pipe, param_grid, cv=5).fit(X_train, y_train)
print(f'Best params: {grid.best_params_} \nBest score: {grid.best_score_}')

Best params: {'linearsvc__C': 0.6} 
Best score: 0.8852941176470589


### LinearSVC on Full Training Set

In [11]:
vectorizer = TfidfVectorizer(ngram_range=(1,3), stop_words='english', max_features=max_feats)
X_train_vec = vectorizer.fit_transform(X_train_full)
X_test_vec = vectorizer.transform(X_test)

clf = LinearSVC(C=0.6).fit(X_train_vec, y_train_full)
clf.score(X_train_vec, y_train_full), clf.score(X_test_vec, y_test)

(0.9918, 0.8922)

## 2.4 Import Test DF

In [14]:
file_path = 'testData.tsv'
test_df = pd.read_csv(file_path, delimiter='\t')

## 2.5 Train on Full Train Dataset, and Predict on Test Dataset

In [15]:
vectorizer = TfidfVectorizer(ngram_range=(1,3), stop_words='english', max_features=max_feats)
X_vec = vectorizer.fit_transform(X)

clf = LinearSVC(C=0.6).fit(X_vec, y)
clf.score(X_vec, y)

0.98916

### Predict

In [29]:
X = test_df['review']
X_clean = clean_text(X)
X_vec = vectorizer.transform(X_clean)
y_pred = clf.predict(X_vec)

test_df['sentiment'] = y_pred
output_df = test_df.set_index('id', drop=True)
output_df = output_df[['sentiment']]
output_df.head()

Unnamed: 0_level_0,sentiment
id,Unnamed: 1_level_1
12311_10,1
8348_2,0
5828_4,0
7186_2,0
12128_7,1


## 2.6 Save Result to CSV

In [31]:
odrive_path = '/content/drive/My Drive/'
folder_path = '/kaggle_datasets/imdb_dataset/'
file_path = 'test_pred.csv'

output_df.to_csv(drive_path + folder_path + file_path, header=True)