In [12]:
import pandas as pd
import numpy as np
import os
import spacy
from IPython.display import display
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


!pip install pysentiment2

import pysentiment2 as ps 



In [2]:
def calculate_polarity_subjectivity(df):
    lm = ps.LM()
    hiv4 = ps.HIV4()
    polarity_array = []
    subjectivity_array = []
    hiv_polarity = []
    hiv_subjectivity = []
    count=0
    count1=0
    for x in range(len(df['filteredtext'])):
        tokens_m = lm.tokenize(df['filteredtext'][x])
        score_m = lm.get_score(tokens_m)
        polarity_array.append(score_m['Polarity'])
        subjectivity_array.append(score_m['Subjectivity'])
        tokens_hiv = hiv4.tokenize(df['filteredtext'][x])
        score_hiv = hiv4.get_score(tokens_hiv)
        hiv_polarity.append(score_hiv['Polarity'])
        hiv_subjectivity.append(score_hiv['Subjectivity'])
        if score_m['Polarity']*score_hiv['Polarity']<0:
            count+=1
    feature_df = pd.DataFrame()
    feature_df['Mcdonald_Polarity'] = polarity_array
    feature_df['Mcdonald_Subjectivity'] = subjectivity_array
    feature_df['HIV_Polarity'] = hiv_polarity
    feature_df['HIV_Subjectivity'] = hiv_subjectivity
    return feature_df

In [3]:
def get_glove_embeddings(df):
        word_list = []
        for i in df['filteredtext']:
            x = i[1:-1].split(", ")
            words = []
            for j in x:
                s = j.split(" ")
                for k in s:
                    words.append(k)
            word_list.append(words)
        filename = './data/glove.6B.100d.txt.word2vec'
        model = KeyedVectors.load_word2vec_format(filename, binary=False)


        embedding_list = []
        for i in word_list:
            embeddings = []
            for j in i:
                try:
                    glov = model[j]
                    embeddings.append(glov)
                except:
                    continue
            embedding_list.append(embeddings)
        return embedding_list

In [13]:
def tfIDFvectorization(input_df):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(list(input_df["filteredtext"]))
    feature_names = vectorizer.get_feature_names()
    dense = vectors.todense()
    denselist = dense.tolist()
    df = pd.DataFrame(denselist, columns=feature_names)
    normalize(df)
    return df

In [14]:
def logisticRegression():
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    # print(classification_report(y_test, clf.predict(X_test)))
    # print("LR Training Score: ",clf.score(X_train,y_train))
    # print("LR F1 Score:",f1_score(y_test,clf.predict(X_test),zero_division=0))
    # print("LR Accuracy:",accuracy_score(y_test,clf.predict(X_test)))
    return accuracy_score(y_test,clf.predict(X_test))

In [15]:
def naiveBayes():
    clf_NB = GaussianNB()
    clf_NB.fit(X_train,y_train)
    # print(classification_report(y_test, clf_NB.predict(X_test)))
    # print("NB Training Score:",clf_NB.score(X_train,y_train))
    # print("NB F1 Score: ",f1_score(y_test,clf_NB.predict(X_test),zero_division=0))
    # print("NB Accuracy:",accuracy_score(y_test,clf_NB.predict(X_test)))
    return accuracy_score(y_test,clf_NB.predict(X_test))

In [16]:
def mlp():
    clf = MLPClassifier()
    clf.fit(X_train, y_train)
    # print(classification_report(y_test, clf.predict(X_test)))
    # print("MLP Training Score: ",clf.score(X_train,y_train))
    # print("MLP F1 Score:",f1_score(y_test,clf.predict(X_test),zero_division=0))
    # print("MLP Accuracy:",accuracy_score(y_test,clf.predict(X_test)))
    return accuracy_score(y_test,clf.predict(X_test))

In [17]:
def linearSVM():
    clf = SVC(kernel="linear")
    clf.fit(X_train, y_train)
    # print(classification_report(y_test, clf.predict(X_test)))
    # print("Linear SVM Training Score: ",clf.score(X_train,y_train))
    # print("Linear SVM F1 Score:",f1_score(y_test,clf.predict(X_test),zero_division=0))
    # print("Linear Accuracy:",accuracy_score(y_test,clf.predict(X_test)))
    return accuracy_score(y_test,clf.predict(X_test))
    

In [18]:
def randomForest():
    clf = RandomForestClassifier(n_estimators=2000)
    clf.fit(X_train, y_train)
    # print(classification_report(y_test, clf.predict(X_test)))
    # print("RF Training Score: ",clf.score(X_train,y_train))
    # print("RF F1 Score:",f1_score(y_test,clf.predict(X_test),zero_division=0))
    # print("RF Accuracy:",accuracy_score(y_test,clf.predict(X_test)))
    return accuracy_score(y_test,clf.predict(X_test))
    

In [19]:
amazon5 = pd.read_csv("data/amazon5.csv")
tfidf_df = tfIDFvectorization(amazon5)
polsub_df = calculate_polarity_subjectivity(amazon5)
#glove_df=get_glove_embeddings(amazon5)
print(tfidf_df.shape)
print(polsub_df.shape)


(4953, 7145)
(4953, 4)


In [None]:
glove_df

In [None]:
glove_df = pd.Dataframe(glove_df, columns=["Glove"])

In [20]:
feature_df = pd.concat([tfidf_df, polsub_df],axis=1)
#feature_df=pd.concat([feature_df,glove_df],axis=1)
# print(feature_df.shape)


In [80]:


pca = PCA(n_components=4000)
amazon5_tfidf_reduced = pca.fit_transform(feature_df)
labels = amazon5["label"]
X_train, X_test, y_train, y_test = train_test_split(amazon5_tfidf_reduced, labels, test_size=0.2, random_state=42)
# if temp>acc1:
#     acc1=temp
#     n1=n

print(logisticRegression())

print(naiveBayes())

print(mlp())

print(linearSVM())
print(randomForest())

0.5227043390514632
0.49848637739656915
0.5418768920282543
0.5237134207870837


NameError: name 'n_estimatorX_train' is not defined

In [83]:
print(randomForest())

0.5368314833501514


In [87]:
!pip install --upgrade tensorflow 

Collecting tensorflow
  Downloading tensorflow-2.4.1-cp37-cp37m-macosx_10_11_x86_64.whl (173.9 MB)
[K     |████████████████████████████████| 173.9 MB 6.6 MB/s 
Collecting h5py~=2.10.0
  Downloading h5py-2.10.0-cp37-cp37m-macosx_10_6_intel.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 64.0 MB/s 
[?25hCollecting flatbuffers~=1.12.0
  Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting opt-einsum~=3.3.0
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 16.0 MB/s 
Collecting wrapt~=1.12.1
  Downloading wrapt-1.12.1.tar.gz (27 kB)
Collecting gast==0.3.3
  Downloading gast-0.3.3-py2.py3-none-any.whl (9.7 kB)
Collecting tensorboard~=2.4
  Downloading tensorboard-2.4.1-py3-none-any.whl (10.6 MB)
[K     |████████████████████████████████| 10.6 MB 20.9 MB/s 
[?25hCollecting astunparse~=1.6.3
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting grpcio~=1.32.0
  Downloading grpcio-1.32.0-c

In [88]:
!pip install Keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM,Bidirectional
from keras.optimizers import Adam
from keras.layers import Embedding




ImportError: Keras requires TensorFlow 2.2 or higher. Install TensorFlow via `pip install tensorflow`

In [89]:
pip install Keras

Note: you may need to restart the kernel to use updated packages.


In [72]:

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM,Bidirectional
from keras.optimizers import Adam
from keras.layers import Embedding
def LSTM_func():
    # model = Sequential()
    # # e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=1500, trainable=False)
    # # model.add(e)
    # model.add(Bidirectional(LSTM(128, input_shape=(X_train.shape[1], 1), dropout=0.2)))
    # model.add(Dense(2,activation='softmax'))
    # model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.01),metrics=['accuracy'])
    # # print(X_train.shape)
    # # print(y_train.reshape())
    # model.fit(X_train, y_train, epochs=30, batch_size=32, verbose=1, validation_split=0.2)
    # score, acc = model.evaluate(trainX, y_train)
    # print('Test score:', score)
    # print('Test accuracy:', acc)
    # testX=np.reshape(X_test, (X_test.shape[0], 1, X_train.shape[1]))
    # score, acc = model.evaluate(testX, y_test)
    # print('Test score:', score)
    # print('Test accuracy:', acc)

    
    # model = Sequential()
    # model.add(LSTM(units=50,return_sequences=True,input_shape=(X_train.shape[1], 1)))
    # model.add(Dropout(0.2))
    # model.add(LSTM(units=50,return_sequences=True))
    # model.add(Dropout(0.2))
    # model.add(LSTM(units=50,return_sequences=True))
    # model.add(Dropout(0.2))
    # model.add(LSTM(units=50))
    # model.add(Dropout(0.2))
    # model.add(Dense(units=1))
    # model.compile(optimizer='adam',loss='mean_squared_error')
    # model.fit(X_train,y_train,epochs=100,batch_size=32)





In [41]:
from sklearn.model_selection import train_test_split

pca = PCA(n_components=4000)
amazon5_tfidf_reduced = pca.fit_transform(feature_df)
labels = amazon5["label"]
trainY = pd.get_dummies(labels.values).values

X_train, X_test, y_train, y_test = train_test_split(amazon5_tfidf_reduced, labels, test_size=0.2, random_state=40)
y_train = y_train.to_numpy()
trainX = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))

In [73]:
LSTM_func()

Epoch 1/30


ValueError: in user code:

    /Users/srihariravi/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    /Users/srihariravi/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /Users/srihariravi/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Users/srihariravi/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Users/srihariravi/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    /Users/srihariravi/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:788 run_step  **
        outputs = model.train_step(data)
    /Users/srihariravi/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:754 train_step
        y_pred = self(x, training=True)
    /Users/srihariravi/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:998 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    /Users/srihariravi/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/input_spec.py:223 assert_input_compatibility
        str(tuple(shape)))

    ValueError: Input 0 of layer sequential_16 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 4000)


In [44]:
trainX.shape

(3962, 1, 4000)