In [1]:
#!pip install psycopg2

In [2]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import warnings

import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

import spacy
import re

from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

warnings.filterwarnings('ignore')

## You can use any data that you like. There are many available text datasets on Kaggle, and you can use them if you like. Or, if you have access to substantial computing power, you can also use large collections of text. But it's a good idea to pick a modest-size dataset so that you can experiment a lot without waiting too much on running the computations.

## Your dataset can include nontext data along with some text data. However, having some text data in your dataset is obligatory.

## In your project, do the following:

## 1) First, make sure to clean your data. You can use the data-cleaning techniques that you practiced in the Text preprocessing checkpoint. But keep in mind that every text dataset is likely to have its own particular cleaning requirements. Make sure to clean your dataset appropriately.

## 2) If your dataset includes numerical features, then work on them for cleaning purposes; for example, deal with any missing values and outliers.

## 3) After the data-cleaning step, do some exploratory data analysis to get to know your dataset better. In the exploratory data analysis, analyze your numerical features as well as your text features. If you feel that converting your text features into a numerical form is required for your exploratory analysis, then you can do this step after vectorizing your text.

## 4) Convert your text features into numerical form. You're free to choose your method. You can use BoW, TF-IDF, word2vec, or any other method that you'd like to experiment with. It's a good idea to use several methods so that you can compare the results with respect to the methods that you apply.

## 5) You can choose whatever NLP task you want. You're completely free in your choice. Here are some possible options:
* Sentiment analysis
* Text classification
* Topic modeling
* Developing a more sophisticated chatbot than what you've already developed in this module
* Training a deep-learning model for a supervised or unsupervised task of your choice

--------------------------------------------------------------------------------

## IMDB movie reviews: 
https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [3]:
imdb = pd.read_csv('data/IMDB Dataset.csv')
imdb.columns = ['review', '-sentiment-']

In [4]:
imdb.head()

Unnamed: 0,review,-sentiment-
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review       50000 non-null  object
 1   -sentiment-  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
def clean_text(doc):
    # tokenize
    doc_words = word_tokenize(doc)

    # lowercase
    doc_lower = [word.lower() for word in doc_words]

    # remove stop words
    doc_nostop = [token for token in doc_lower if token not in stopwords.words('english')]

    # remove punctuation
    doc_nopunc = [word for word in doc_nostop if word.isalpha()]

    # stem the tokens
    stemmer = SnowballStemmer('english')
    doc_stem = [stemmer.stem(word) for word in doc_nopunc]

    return_doc = ' '.join(doc_stem)

    return return_doc

In [7]:
# Split into positive and negative documents
pos_doc = ''
neg_doc = ''
for index, row in imdb[:2000].iterrows():
    if row['-sentiment-'] == 'positive':
        pos_doc = pos_doc + row['review']
    else:
        neg_doc = neg_doc + row['review']

## Perform NLP, parse, and clean the text

In [8]:
# Parse the reviews
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 2000000
pos_doc = nlp(pos_doc)
neg_doc = nlp(neg_doc)

In [9]:
# Group into sentences
pos_sents = [[sent, "positive"] for sent in pos_doc.sents]
neg_sents = [[sent, "negative"] for sent in neg_doc.sents]

# Combine the sentences from the two docs into one DataFrame
sentences = pd.DataFrame(pos_sents[:2000] + neg_sents[:2000], columns = ["text", "-sentiment-"])

In [10]:
sentences.head()

Unnamed: 0,text,-sentiment-
0,"(One, of, the, other, reviewers, has, mentione...",positive
1,"(They, are, right, ,, as, this, is, exactly, w...",positive
2,"(first, thing, that, struck, me, about, Oz, wa...",positive
3,"(Trust, me, ,, this, is, not, a, show, for, th...",positive
4,"(This, show, pulls, no, punches, with, regards...",positive


In [11]:
# clean the text
for i, sentence in enumerate(sentences["text"]):
    sentences.loc[i, "text"] = " ".join(
        [token.lemma_ for token in sentence if not token.is_punct and not token.is_stop and token.is_alpha])

## Use BOW to convert text feature

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='word')
X = vectorizer.fit_transform(sentences["text"])
bow_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
bow_sentences = pd.concat([bow_df, sentences[["text", "-sentiment-"]]], axis=1)

In [13]:
bow_sentences.shape

(4000, 7219)

## Analyze the BOW data

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

Y = bow_sentences['-sentiment-']
X = np.array(bow_sentences.drop(['text','-sentiment-'], 1))

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=123)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)

print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.9084375

Test set score: 0.67
----------------------Random Forest Scores----------------------
Training set score: 0.9534375

Test set score: 0.64375
----------------------Gradient Boosting Scores----------------------
Training set score: 0.695625

Test set score: 0.5925


## Use TF-IDF to convert text feature

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_df=0.5, min_df=2, use_idf=True, norm=u'l2', smooth_idf=True)
X = vectorizer.fit_transform(sentences["text"])
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
tfidf_sentences = pd.concat([bow_df, sentences[["text", "-sentiment-"]]], axis=1)

In [16]:
tfidf_sentences.head()

Unnamed: 0,aaliyah,abandon,abbot,abbreviate,abet,abhorrent,abide,ability,abject,able,...,ziyi,zombie,zombies,zone,zoo,zoom,zulu,zwick,text,-sentiment-
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,reviewer mention watch Oz episode hook,positive
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,right exactly happen,positive
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,thing strike Oz brutality unflinche scene viol...,positive
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,trust faint hearted timid,positive
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,pull punch regard drug sex violence,positive


## Analyze the TF-IDF data

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

Y = tfidf_sentences['-sentiment-']
X = np.array(tfidf_sentences.drop(['text','-sentiment-'], 1))

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)

print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.9179166666666667

Test set score: 0.645
----------------------Random Forest Scores----------------------
Training set score: 0.9541666666666667

Test set score: 0.613125
----------------------Gradient Boosting Scores----------------------
Training set score: 0.7079166666666666

Test set score: 0.58625


## Use Word2Vec to convert text feature

In [18]:
import gensim

# Train word2vec on the sentences
model = gensim.models.Word2Vec(
    sentences["text"],
    workers=4,
    min_count=0.5,
    window=12,
    sg=0,
    sample=0.001,
    size=100,
    hs=1
)

In [19]:
word2vec_arr = np.zeros((sentences.shape[0], 100))

for i, sentence in enumerate(sentences["text"]):
    word2vec_arr[i,:] = np.mean([model[lemma] for lemma in sentence], axis=0)

word2vec_arr = pd.DataFrame(word2vec_arr)
w2v_sentences = pd.concat([sentences[["-sentiment-", "text"]],word2vec_arr], axis=1)
w2v_sentences.dropna(inplace=True)

w2v_sentences.head()

Unnamed: 0,-sentiment-,text,0,1,2,3,4,5,6,7,...,90,91,92,93,94,95,96,97,98,99
0,positive,reviewer mention watch Oz episode hook,-0.074249,0.114671,0.015121,0.073763,0.069884,0.008495,0.053634,0.012058,...,0.178076,0.024288,-0.016427,0.016274,-0.06389,-0.139,0.106942,0.036321,-0.016587,-0.087598
1,positive,right exactly happen,-0.066062,0.178012,0.0085,0.106561,0.080411,-0.014588,0.107324,-0.001012,...,0.148205,0.034288,-0.035965,0.03302,-0.022341,-0.09926,0.208515,0.149304,-0.057306,-0.186135
2,positive,thing strike Oz brutality unflinche scene viol...,-0.069869,0.162468,0.028621,0.098696,0.072301,-0.001498,0.076688,0.024982,...,0.143718,0.040915,-0.018511,0.028187,-0.019662,-0.100381,0.110346,0.068416,-0.044475,-0.107071
3,positive,trust faint hearted timid,-0.127443,0.218737,0.056772,0.129232,0.085079,-0.003635,0.079976,0.024315,...,0.119139,0.043489,-0.01294,0.025341,0.014246,-0.057026,0.168648,0.102968,-0.070157,-0.163152
4,positive,pull punch regard drug sex violence,-0.007807,0.150145,-0.003704,0.085559,0.072203,-0.012184,0.10827,0.005073,...,0.142472,0.040177,-0.026639,0.0389,-0.023089,-0.104056,0.151252,0.110283,-0.04711,-0.138467


## Analyze the Word2Vec data

In [20]:
Y = w2v_sentences['-sentiment-']
X = np.array(w2v_sentences.drop(['text','-sentiment-'], 1))

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)

print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.5386012715712988

Test set score: 0.5592643051771117
----------------------Random Forest Scores----------------------
Training set score: 0.9909173478655767

Test set score: 0.5504087193460491
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8578564940962761

Test set score: 0.5497275204359673
