In [2]:
!pip install nltk



In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
import re

In [4]:
nltk.download('stopwords')
porter_stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sounishnath/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
df = pd.read_csv(r"./train.tsv", sep="\t", usecols=["Phrase", "Sentiment"])

In [6]:
df["Phrase"] = df["Phrase"].astype(np.str)

In [6]:
df.head()

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
1,A series of escapades demonstrating the adage ...,2
2,A series,2
3,A,2
4,series,2


In [7]:
def preprocess_text (text):
    text = " ".join(text.split(" "))
    tokens = re.split("\W+", text.lower())
    tokens = [w for w in tokens if not w in stop_words]
    text = " ".join([porter_stemmer.stem(w) for w in tokens if not w in string.punctuation and w.isalpha() ])
    return text

In [8]:
df["stemmed"] = df["Phrase"].apply(preprocess_text)

In [9]:
df.head()

Unnamed: 0,Phrase,Sentiment,stemmed
0,A series of escapades demonstrating the adage ...,1,seri escapad demonstr adag good goos also good...
1,A series of escapades demonstrating the adage ...,2,seri escapad demonstr adag good goos
2,A series,2,seri
3,A,2,
4,series,2,seri


In [9]:
tfidf_vectorizer = TfidfVectorizer(
        stop_words=stop_words, 
        ngram_range=(1, 1),
        max_df=0.60,
        min_df=2,
        max_features=2000,
    )

In [10]:
stemmed_transformed = tfidf_vectorizer.fit_transform(df["stemmed"])

In [11]:
vectors = np.asarray( stemmed_transformed.todense() )

In [12]:
df["Sentiment"].unique()

array([1, 2, 3, 4, 0])

In [27]:
pca = PCA(n_components=50, random_state=42)

In [28]:
%%time
vv = pca.fit_transform(vectors)
vv

CPU times: user 1min 37s, sys: 6.23 s, total: 1min 43s
Wall time: 18.3 s


array([[-7.84604497e-03, -1.50104254e-02, -1.40385565e-03, ...,
         2.17950234e-03,  1.15091543e-03, -1.08622871e-02],
       [-6.40901112e-03, -1.13758912e-02, -1.10705090e-03, ...,
        -4.27874535e-03, -1.13712810e-03, -9.87729427e-03],
       [-9.61217892e-03, -1.98493738e-02, -1.06362269e-02, ...,
        -5.48398638e-03, -4.34576643e-03, -9.34262656e-03],
       ...,
       [-9.79631994e-03, -2.27641468e-02, -9.62730587e-03, ...,
         1.51770691e-06, -2.24791711e-04, -3.41138740e-03],
       [-9.79631994e-03, -2.27641468e-02, -9.62730587e-03, ...,
         1.51770691e-06, -2.24791711e-04, -3.41138740e-03],
       [-9.79631994e-03, -2.27641468e-02, -9.62730587e-03, ...,
         1.51770691e-06, -2.24791711e-04, -3.41138740e-03]])

In [30]:
from sklearn.preprocessing import MinMaxScaler
scalar = MinMaxScaler()
vv_scale = scalar.fit_transform(vv)

In [17]:
%%time
mlp_model = MLPClassifier(random_state=42)
mlp_model.fit(vv_scale, df["Sentiment"])
mlp_model.score(vv_scale, df["Sentiment"])

CPU times: user 11min 29s, sys: 3min 50s, total: 15min 20s
Wall time: 2min 6s




0.5460335768294245

In [18]:
multinomial_model = MultinomialNB()
multinomial_model.fit(vv_scale, df["Sentiment"])
multinomial_model.score(vv_scale, df["Sentiment"])

0.5099448929898757

In [19]:
%%time
random_forest_clf = RandomForestClassifier(max_depth=20, random_state=42, n_estimators=600)
random_forest_clf.fit(vv_scale, df["Sentiment"])
random_forest_clf.score(vv_scale, df["Sentiment"])

KeyboardInterrupt: 

In [21]:
%%time
adaboosted_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=600,
)
adaboosted_clf.fit(vv_scale, df["Sentiment"])
adaboosted_clf.score(vv_scale, df["Sentiment"])

KeyboardInterrupt: 

In [15]:
test = df[df.index == 1032]
test

Unnamed: 0,Phrase,Sentiment,stemmed
1032,gives new meaning to the phrase ` fatal script...,0,give new mean phrase fatal script error


In [18]:
xx = test["Phrase"].values[0]
xx

"gives new meaning to the phrase ` fatal script error . '"

In [36]:
%%time
testt = preprocess_text(xx)
test_vectors = tfidf_vectorizer.transform([testt]).todense()
test_vectors = pca.transform(test_vectors.copy())

CPU times: user 3.58 ms, sys: 2.65 ms, total: 6.23 ms
Wall time: 5.53 ms




In [25]:
mlp_model.predict(test_vectors)

array([4])

In [20]:
import tensorflow as tf

In [31]:
tf_model = tf.keras.Sequential(layers=[
    tf.keras.layers.Dense(50, input_dim=50, activation="relu"),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(1, activation="relu"),
])

In [32]:
tf_model.compile(optimizer="adam", metrics=["mape"], loss=["mse"])

In [33]:
tf_model.fit(vv_scale, df["Sentiment"].values, epochs=2)

2021-11-27 14:17:23.855063: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-27 14:17:23.855679: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2021-11-27 14:17:23.973451: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x147aea7f0>

In [37]:
tf_model.predict(test_vectors)

2021-11-27 14:18:32.535808: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


array([[1.6192502]], dtype=float32)

In [38]:
tf_preds = np.round(tf_model.predict(vv_scale))

In [39]:
from sklearn.metrics import classification_report

classification_report(df["Sentiment"], tf_preds)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n           0       0.00      0.00      0.00      7072\n           1       0.34      0.01      0.03     27273\n           2       0.52      0.99      0.68     79582\n           3       0.42      0.04      0.07     32927\n           4       1.00      0.00      0.00      9206\n\n    accuracy                           0.51    156060\n   macro avg       0.46      0.21      0.16    156060\nweighted avg       0.47      0.51      0.37    156060\n'

In [45]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print(f'Accuracy: {accuracy_score(df["Sentiment"], tf_preds):.2f}')
print(f'Precision: {precision_score(df["Sentiment"], tf_preds, average="micro"):.2f}')
print(f'Recall: {recall_score(df["Sentiment"], tf_preds, average="micro"):.2f}')

Accuracy: 0.51
Precision: 0.51
Recall: 0.51
