In [13]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


!pip install pysentiment2

import pysentiment2 as ps 



In [14]:
def calculate_polarity_subjectivity(df):
    lm = ps.LM()
    hiv4 = ps.HIV4()
    polarity_array = []
    subjectivity_array = []
    hiv_polarity = []
    hiv_subjectivity = []
    count=0
    count1=0
    for x in range(len(df['filteredtext'])):
        tokens_m = lm.tokenize(df['filteredtext'][x])
        score_m = lm.get_score(tokens_m)
        polarity_array.append(score_m['Polarity'])
        subjectivity_array.append(score_m['Subjectivity'])
        tokens_hiv = hiv4.tokenize(df['filteredtext'][x])
        score_hiv = hiv4.get_score(tokens_hiv)
        hiv_polarity.append(score_hiv['Polarity'])
        hiv_subjectivity.append(score_hiv['Subjectivity'])
        if score_m['Polarity']*score_hiv['Polarity']<0:
            count+=1
    feature_df = pd.DataFrame()
    feature_df['Mcdonald_Polarity'] = polarity_array
    feature_df['Mcdonald_Subjectivity'] = subjectivity_array
    feature_df['HIV_Polarity'] = hiv_polarity
    feature_df['HIV_Subjectivity'] = hiv_subjectivity
    return feature_df

In [15]:
def get_glove_embeddings(df):
        word_list = []
        for i in df['filteredtext']:
            x = i[1:-1].split(", ")
            words = []
            for j in x:
                s = j.split(" ")
                for k in s:
                    words.append(k)
            word_list.append(words)
        filename = './data/glove.6B.100d.txt.word2vec'
        model = KeyedVectors.load_word2vec_format(filename, binary=False)


        embedding_list = []
        for i in word_list:
            embeddings = []
            for j in i:
                try:
                    glov = model[j]
                    embeddings.append(glov)
                except:
                    continue
            embedding_list.append(embeddings)
        return embedding_list

In [16]:
def tfIDFvectorization(input_df):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(list(input_df["filteredtext"]))
    feature_names = vectorizer.get_feature_names()
    dense = vectors.todense()
    denselist = dense.tolist()
    df = pd.DataFrame(denselist, columns=feature_names)
    normalize(df)
    return df

In [17]:
def logisticRegression():
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    # print(classification_report(y_test, clf.predict(X_test)))
    # print("LR Training Score: ",clf.score(X_train,y_train))
    # print("LR F1 Score:",f1_score(y_test,clf.predict(X_test),zero_division=0))
    # print("LR Accuracy:",accuracy_score(y_test,clf.predict(X_test)))
    return accuracy_score(y_test,clf.predict(X_test))

In [18]:
def naiveBayes():
    clf_NB = GaussianNB()
    clf_NB.fit(X_train,y_train)
    # print(classification_report(y_test, clf_NB.predict(X_test)))
    # print("NB Training Score:",clf_NB.score(X_train,y_train))
    # print("NB F1 Score: ",f1_score(y_test,clf_NB.predict(X_test),zero_division=0))
    # print("NB Accuracy:",accuracy_score(y_test,clf_NB.predict(X_test)))
    return accuracy_score(y_test,clf_NB.predict(X_test))

In [19]:
def mlp():
    clf = MLPClassifier()
    clf.fit(X_train, y_train)
    # print(classification_report(y_test, clf.predict(X_test)))
    # print("MLP Training Score: ",clf.score(X_train,y_train))
    # print("MLP F1 Score:",f1_score(y_test,clf.predict(X_test),zero_division=0))
    # print("MLP Accuracy:",accuracy_score(y_test,clf.predict(X_test)))
    return accuracy_score(y_test,clf.predict(X_test))

In [20]:
def linearSVM():
    clf = SVC(kernel="linear")
    clf.fit(X_train, y_train)
    # print(classification_report(y_test, clf.predict(X_test)))
    # print("Linear SVM Training Score: ",clf.score(X_train,y_train))
    # print("Linear SVM F1 Score:",f1_score(y_test,clf.predict(X_test),zero_division=0))
    # print("Linear Accuracy:",accuracy_score(y_test,clf.predict(X_test)))
    return accuracy_score(y_test,clf.predict(X_test))
    

In [21]:
def randomForest():
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    # print(classification_report(y_test, clf.predict(X_test)))
    # print("RF Training Score: ",clf.score(X_train,y_train))
    # print("RF F1 Score:",f1_score(y_test,clf.predict(X_test),zero_division=0))
    # print("RF Accuracy:",accuracy_score(y_test,clf.predict(X_test)))
    return accuracy_score(y_test,clf.predict(X_test))
    

In [22]:
amazon5 = pd.read_csv("data/amazon5.csv")
tfidf_df = tfIDFvectorization(amazon5)
polsub_df = calculate_polarity_subjectivity(amazon5)
# glove_df=get_glove_embeddings(amazon5)
print(tfidf_df.shape)
print(polsub_df.shape)


(4232, 6092)
(4232, 4)


In [23]:
feature_df = pd.concat([tfidf_df, polsub_df],axis=1)
# feature_df=pd.concat([feature_df,glove_df],axis=1)
print(feature_df.shape)


(4232, 6096)


In [25]:
n=4232
acc1=-1
acc2=-1
acc3=-1
acc4=-1
acc5=-1


n1=-1
n2=-1
n3=-1
n4=-1
n5=-1

while n>0:
    pca = PCA(n_components=n)
    amazon5_tfidf_reduced = pca.fit_transform(feature_df)
    labels = amazon5["label"]
    X_train, X_test, y_train, y_test = train_test_split(amazon5_tfidf_reduced, labels, test_size=0.2, random_state=42)
    temp=logisticRegression()
    if temp>acc1:
        acc1=temp
        n1=n

    print()
    temp=naiveBayes()
    if temp>acc2:
        acc2=temp
        n2=n
    print()
    temp=mlp()
    if temp>acc3:
        acc3=temp
        n3=n
    print()
    temp=linearSVM()
    if temp>acc4:
        acc4=temp
        n4=n
    print()
    temp=randomForest()
    if temp>acc5:
        acc5=temp
        n5=n
    n-=1
    print(n)


ValueError: n_components=6096 must be between 0 and min(n_samples, n_features)=4232 with svd_solver='full'