In [1]:
!pip install nltk



In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.decomposition import PCA
import re

In [3]:
# The stemmer needs no corpus data, so build it first.
porter_stemmer = PorterStemmer()

# Fetch the NLTK stop-word corpus, then load the English list into a set
# so membership tests during preprocessing are cheap.
nltk.download('stopwords')
english_stop_words = stopwords.words("english")
stop_words = set(english_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sounishnath/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Read only the two columns used below from the tab-separated training file.
wanted_columns = ["Phrase", "Sentiment"]
df = pd.read_csv(r"./train.tsv", sep="\t", usecols=wanted_columns)

In [5]:
# Force every phrase to a Python string before text processing.
# `np.str` was only an alias of the builtin `str`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
df["Phrase"] = df["Phrase"].astype(str)

In [6]:
# Show the first rows of the loaded frame as a quick sanity check.
df.head()

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
1,A series of escapades demonstrating the adage ...,2
2,A series,2
3,A,2
4,series,2


In [7]:
def preprocess_text(text):
    """Turn a raw phrase into a space-separated string of stemmed tokens.

    The phrase is lower-cased and split on runs of non-word characters.
    Tokens that are English stop words or that are not purely alphabetic
    (numbers, empty strings produced by the split, mixed tokens) are dropped,
    and the remaining tokens are Porter-stemmed.

    Relies on the notebook-level ``stop_words`` set and ``porter_stemmer``.

    Parameters
    ----------
    text : str
        The raw phrase.

    Returns
    -------
    str
        Stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    # Raw string: "\W" in a normal string literal is an invalid escape sequence.
    tokens = re.split(r"\W+", text.lower())
    # `isalpha()` already rejects empty and punctuation-only tokens, so no
    # separate punctuation check is needed.
    return " ".join(
        porter_stemmer.stem(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    )

In [8]:
# Run the text normalisation over every phrase and keep the result in a new column.
stemmed_phrases = df["Phrase"].map(preprocess_text)
df["stemmed"] = stemmed_phrases

In [9]:
# Show the first rows again, now including the new "stemmed" column.
df.head()

Unnamed: 0,Phrase,Sentiment,stemmed
0,A series of escapades demonstrating the adage ...,1,seri escapad demonstr adag good goos also good...
1,A series of escapades demonstrating the adage ...,2,seri escapad demonstr adag good goos
2,A series,2,seri
3,A,2,
4,series,2,seri


In [10]:
# TF-IDF over unigrams of the stemmed text.
# - stop_words is passed as a list: scikit-learn documents this parameter as
#   'english', a list or None, and recent releases reject a set during
#   parameter validation. Sorting keeps the list order deterministic.
# - max_df / min_df drop terms found in more than 60% of the documents or in
#   fewer than 2 documents.
# - max_features caps the vocabulary at the 2000 most frequent terms.
tfidf_vectorizer = TfidfVectorizer(
    stop_words=sorted(stop_words),
    ngram_range=(1, 1),
    max_df=0.60,
    min_df=2,
    max_features=2000,
)

In [11]:
# Learn the vocabulary / IDF weights from the stemmed phrases and vectorise
# them in one step.
stemmed_transformed = tfidf_vectorizer.fit_transform(df["stemmed"])

In [12]:
# Materialise the TF-IDF matrix as a plain dense ndarray for PCA.
# NOTE(review): this holds the full rows x features matrix in memory at once.
vectors = stemmed_transformed.toarray()

In [13]:
# List the distinct sentiment labels present in the data.
df["Sentiment"].unique()

array([1, 2, 3, 4, 0])

In [14]:
# Reduce the TF-IDF features to 50 principal components; the fixed
# random_state keeps the decomposition reproducible.
pca = PCA(n_components=50, random_state=42)

In [15]:
%%time
# Fit the PCA on the dense TF-IDF matrix and project it in one step.
vv = pca.fit_transform(vectors)
# Display the projected array.
vv

CPU times: user 1min 33s, sys: 7.37 s, total: 1min 40s
Wall time: 19.1 s


array([[-7.84604497e-03, -1.50104254e-02, -1.40385565e-03, ...,
         2.17950234e-03,  1.15091543e-03, -1.08622871e-02],
       [-6.40901112e-03, -1.13758912e-02, -1.10705090e-03, ...,
        -4.27874535e-03, -1.13712810e-03, -9.87729427e-03],
       [-9.61217892e-03, -1.98493738e-02, -1.06362269e-02, ...,
        -5.48398638e-03, -4.34576643e-03, -9.34262656e-03],
       ...,
       [-9.79631994e-03, -2.27641468e-02, -9.62730587e-03, ...,
         1.51770691e-06, -2.24791711e-04, -3.41138740e-03],
       [-9.79631994e-03, -2.27641468e-02, -9.62730587e-03, ...,
         1.51770691e-06, -2.24791711e-04, -3.41138740e-03],
       [-9.79631994e-03, -2.27641468e-02, -9.62730587e-03, ...,
         1.51770691e-06, -2.24791711e-04, -3.41138740e-03]])

In [16]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every PCA component with a min-max scaler (default range [0, 1]).
# The fitted scaler is kept in `scalar` so new samples can be transformed
# the same way.
scalar = MinMaxScaler().fit(vv)
vv_scale = scalar.transform(vv)

In [None]:
%%time
# Multi-layer perceptron on the scaled PCA features (fit() returns the estimator).
mlp_model = MLPClassifier(random_state=42).fit(vv_scale, df["Sentiment"])
# Mean accuracy on the same data the model was fitted on.
mlp_model.score(vv_scale, df["Sentiment"])

In [None]:
# Multinomial naive Bayes on the scaled PCA features (fit() returns the estimator).
multinomial_model = MultinomialNB().fit(vv_scale, df["Sentiment"])
# Mean accuracy on the same data the model was fitted on.
multinomial_model.score(vv_scale, df["Sentiment"])

In [None]:
# Random forest of 600 depth-limited trees on the scaled PCA features.
random_forest_clf = RandomForestClassifier(
    n_estimators=600,
    max_depth=20,
    random_state=42,
).fit(vv_scale, df["Sentiment"])
# Mean accuracy on the same data the model was fitted on.
random_forest_clf.score(vv_scale, df["Sentiment"])

In [None]:
# Boost a depth-limited random forest with the SAMME algorithm.
# The base learner is passed positionally: AdaBoostClassifier has no
# `base_model` keyword (it is `estimator` in scikit-learn >= 1.2 and
# `base_estimator` before that), so the previous keyword raised a TypeError.
# The first positional parameter is the base learner under both names.
adaboosted_clf = AdaBoostClassifier(
    RandomForestClassifier(max_depth=20),
    n_estimators=600,
    learning_rate=1.5,
    algorithm="SAMME",
)
adaboosted_clf.fit(vv_scale, df["Sentiment"])
# Mean accuracy on the same data the model was fitted on.
adaboosted_clf.score(vv_scale, df["Sentiment"])

In [None]:
# Pick the row whose index label is 1032 as a one-row frame for a spot check.
is_sample_row = df.index == 1032
test = df.loc[is_sample_row]
test

In [None]:
# Pull the raw phrase text out of the first (only) selected row.
xx = test["Phrase"].iloc[0]
xx

In [None]:
%%time
# Push the single phrase through the same pipeline the training data went
# through: preprocess -> TF-IDF -> PCA -> min-max scaling.
testt = preprocess_text(xx)
# .toarray() gives a plain ndarray; .todense() returns np.matrix, which
# recent scikit-learn versions refuse as estimator input.
test_vectors = tfidf_vectorizer.transform([testt]).toarray()
# The models were fitted on `vv_scale`, so the fitted scaler must be applied
# to the PCA projection of the sample as well.
test_vectors = scalar.transform(pca.transform(test_vectors))

In [None]:
# Predict the sentiment class of the single sample with the fitted MLP.
mlp_model.predict(test_vectors)

In [None]:
import tensorflow as tf

In [None]:
# Small fully connected network over the 50 input features, built layer by
# layer: 50 -> 128 -> 32 hidden units, then a single ReLU output unit.
tf_model = tf.keras.Sequential()
tf_model.add(tf.keras.layers.Dense(50, input_dim=50, activation="relu"))
tf_model.add(tf.keras.layers.Dense(128, activation="relu"))
tf_model.add(tf.keras.layers.Dense(32, activation="relu"))
tf_model.add(tf.keras.layers.Dense(1, activation="relu"))

In [None]:
# The network has one unbounded output trained with MSE against the integer
# labels, i.e. it is a regression model, so track a regression metric (MAE).
# Precision / recall are binary-classification metrics that expect
# predictions in [0, 1], and MAPE divides by the target, which is 0 for one
# of the labels, so none of them is meaningful here.
tf_model.compile(optimizer="adam", loss="mse", metrics=["mae"])

In [None]:
# Train for two passes over the scaled features against the integer labels.
tf_model.fit(
    vv_scale,
    df["Sentiment"].to_numpy(),
    epochs=2,
)

In [None]:
# Raw network output for the single test sample (a continuous score, not a class).
tf_model.predict(test_vectors)

In [None]:
# Turn the continuous network output into class labels:
# round to the nearest integer, clamp into the observed label range (the
# regression output can round past either end) and flatten the (n, 1) column
# into a 1-D integer vector, which is what the sklearn metrics expect.
tf_preds = (
    np.clip(
        np.round(tf_model.predict(vv_scale)),
        df["Sentiment"].min(),
        df["Sentiment"].max(),
    )
    .astype(int)
    .ravel()
)

In [None]:
from sklearn.metrics import classification_report

# classification_report returns a multi-line string; print it so the table
# renders with real line breaks instead of an escaped repr.
print(classification_report(df["Sentiment"], tf_preds))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# The target has several classes, so precision/recall need an explicit
# averaging mode: the default average='binary' raises ValueError on
# multiclass targets. Macro averaging weights every class equally;
# zero_division=0 scores a class that is never predicted as 0 instead of
# emitting a warning.
print(f'Accuracy: {accuracy_score(df["Sentiment"], tf_preds):.2f}')
print(f'Precision: {precision_score(df["Sentiment"], tf_preds, average="macro", zero_division=0):.2f}')
print(f'Recall: {recall_score(df["Sentiment"], tf_preds, average="macro", zero_division=0):.2f}')