In [None]:
from tqdm.auto import tqdm
import numpy as np
import os, glob, pickle
import pandas as pd
from collections import Counter
import re, string
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import seaborn as sns
from collections import defaultdict, OrderedDict
import utils.score as evaluate

import nltk
from nltk.corpus import stopwords
# Download the NLTK stop-word corpus so the lookup on the next line can succeed.
nltk.download('stopwords')
# NOTE: this rebinds `stopwords` from the NLTK corpus reader to a plain set of
# English stop words; `prepr` uses that set for membership tests.
stopwords = set(stopwords.words('english'))

import warnings
# Silence every warning for the rest of the session.
warnings.simplefilter("ignore")

In [None]:
# Shell escape: list the contents of data/3a/.
!ls data/3a/

### preprocessing

In [None]:
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    "Punctuation" means the characters listed in ``string.punctuation``.
    All other characters (letters, digits, whitespace, non-ASCII symbols)
    are kept in their original order.
    """
    kept_chars = [ch for ch in text if ch not in string.punctuation]
    return ''.join(kept_chars)

def prepr(txt):
    """Split a text on whitespace and normalise each token.

    Rules, applied per whitespace-separated token in this order:
      * a token containing '@' is replaced by the placeholder '@someuser';
      * a token containing '#' is replaced by the placeholder '#someuser';
      * a token that is an English stop word is dropped;
      * anything else has its punctuation stripped.

    Compared with the earlier version, two kinds of junk tokens are no longer
    emitted: tokens that become the empty string once punctuation is stripped
    (e.g. "!!!" or "-"), and stop words that only match after stripping
    (e.g. "the," -> "the").

    The stop-word set is lower-case, so callers are expected to lower-case
    ``txt`` first (as ``naive_bayes_`` does).

    Args:
        txt: the text to tokenise.

    Returns:
        list[str]: the normalised tokens, in order of appearance.
    """
    tokens = []
    for raw in txt.split():
        if '@' in raw:
            tokens.append('@someuser')
        elif '#' in raw:
            tokens.append('#someuser')
        elif raw in stopwords:
            continue
        else:
            cleaned = remove_punctuation(raw)
            # Skip tokens that were pure punctuation, and stop words that were
            # hidden behind attached punctuation.
            if not cleaned or cleaned in stopwords:
                continue
            tokens.append(cleaned)

    return tokens

### Naive Bayes

In [None]:
class naive_bayes_():
    """Multinomial Naive Bayes text classifier with add-one (Laplace) smoothing.

    Texts are lower-cased and tokenised with ``prepr``. ``fit`` estimates the
    class priors and the per-class word likelihoods from the training split;
    ``predict`` scores raw (untokenised) texts.
    """

    def __init__(self,train, test):
        """Tokenise the data and build the vocabulary.

        Args:
            train: mapping with keys 'x' (iterable of raw texts) and 'y'
                (iterable of labels).
            test: mapping with the same keys, or a falsy value such as
                ``None``. When given, its tokens are added to the vocabulary;
                ``fit`` never reads its texts or labels.
        """
        self.train_x = [prepr(i.lower()) for i in tqdm(train['x'])]
        self.train_y = [i for i in tqdm(train['y'])]

        if test:
            self.test_x = [prepr(i.lower()) for i in tqdm(test['x'])]
            self.test_y = [i for i in tqdm(test['y'])]

            self.data_x, self.data_y = self.train_x + self.test_x, self.train_y + self.test_y

        else:
            self.test_x = None
            self.test_y = None
            self.data_x = self.train_x
            self.data_y = self.train_y

        # Sorted list of every distinct token seen in the available data.
        self.vocab = sorted({tok for doc in tqdm(self.data_x) for tok in doc})

    def fit(self):
        """Estimate priors and smoothed word likelihoods from the training split.

        Sets ``self.labels``, ``self.vti`` (token -> column index),
        ``self.prior``, ``self.matrix`` and ``self.dummy``.

        Returns:
            tuple: ``(prior, matrix)`` where ``prior`` maps label -> P(label)
            and ``matrix`` maps label -> array of P(word | label) over the
            vocabulary.
        """
        class_counts = Counter(self.train_y)
        prior = {i:class_counts[i]/len(self.train_y) for i in set(class_counts)}
        self.labels = prior.keys()

        matrix = {i:np.zeros(len(self.vocab)) for i in self.labels}
        self.vti = {i:d for d,i in enumerate(self.vocab)}

        # Smoothing denominator per class: |V| + number of training tokens.
        dummy = {i:len(self.vocab) for i in self.labels}
        for i in tqdm(range(len(self.train_y))):
            count = Counter(self.train_x[i])
            dummy[self.train_y[i]] += sum(count.values())

            for k,v in count.items():
                matrix[self.train_y[i]][self.vti[k]]+= v

        # Add-one smoothing: (count + 1) / (tokens in class + |V|).
        for k in matrix:
            matrix[k] = (matrix[k] + 1) / dummy[k]

        self.prior = prior
        self.matrix = matrix
        self.dummy = dummy

        return prior, matrix

    def predict(self,test_x,pred=True):
        """Classify raw texts.

        Scores are accumulated as log-probabilities and normalised with the
        log-sum-exp trick. Multiplying raw probabilities underflows to 0 for
        long texts, which previously turned the normalisation into 0/0.

        Out-of-vocabulary tokens get the smoothed zero-count likelihood
        ``1 / dummy[label]``. ``dummy`` already contains the vocabulary size,
        so it must not be added a second time.

        Args:
            test_x: iterable of raw text strings.
            pred: if True return the most probable label per text; otherwise
                return, per text, a dict label -> normalised probability.

        Returns:
            list: labels (``pred=True``) or probability dicts (``pred=False``).
        """
        # Empty tokens are discarded, matching the old join/split round trip.
        docs = [[w for w in prepr(i.lower()) if w] for i in test_x]

        labels = list(self.labels)
        log_prior = {j: np.log(self.prior[j]) for j in labels}
        log_matrix = {j: np.log(self.matrix[j]) for j in labels}
        log_unseen = {j: -np.log(self.dummy[j]) for j in labels}

        pred_y = []
        for tokens in tqdm(docs):
            known = np.fromiter((self.vti[w] for w in tokens if w in self.vti), dtype=np.intp)
            n_unseen = len(tokens) - len(known)

            scores = {
                j: log_prior[j] + log_matrix[j][known].sum() + n_unseen * log_unseen[j]
                for j in labels
            }

            # Subtracting the best score makes the largest weight exp(0) = 1,
            # so the normaliser can never be zero.
            top = max(scores.values())
            weights = {j: np.exp(s - top) for j, s in scores.items()}
            total = sum(weights.values())
            pred_y.append({j: w / total for j, w in weights.items()})

        if pred: return [max(p, key=p.get) for p in pred_y]
        else: return pred_y


.

# 3A

In [None]:
# Dataset loading

train_3a = pd.read_csv('data/3a/sentiment_train.csv') # 11.2 lakh rows, per the original note
test_3a = pd.read_csv('data/3a/sentiment_test.csv') # 4.8 lakh rows, per the original note

dummy = False # Dummy data for testing

if dummy:
    # Tiny hand-written corpus for smoke-testing the classifier.
    train = dict(
        x=['chinese beijing chinese', 'chinese chinese shanghai', 'chinese macao','tokyo japan chinese'],
        y=[1,1,1,2],
    )
    test = dict(
        x=['chinese chinese chinese tokyo japan', 'fuck this shit japan japan tokyo'],
        y=[1,2],
    )
else:
    # Column '5' supplies the texts and column '0' the labels.
    train = dict(x=train_3a['5'], y=train_3a['0'])
    test = dict(x=test_3a['5'], y=list(test_3a['0']))

In [None]:
# Build the classifier from the training split only (test=None, so the
# vocabulary contains training tokens only), then estimate priors and
# smoothed word likelihoods.
naive_bayes = naive_bayes_(train,None)
prior, matrix = naive_bayes.fit()

# pred=True -> one predicted label per input text.
pred = naive_bayes.predict(test['x'], True)
assert len(pred) == len(test['y'])

# Evaluation

In [None]:
# Score the predictions with the project helper utils.score (imported as
# `evaluate`). NOTE(review): its return value is not visible from this
# notebook -- confirm what `metric` contains.
metric = evaluate.evaluate(test['y'], pred, naive_bayes.labels)
metric

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
cm = metrics.confusion_matrix(test['y'], pred)
# fmt='d' prints the exact integer counts; the default '.2g' rounds them to
# two significant digits (e.g. 1.9e+05).
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');

# 3B

In [None]:
# Dataset loading

header_list = ['id', 'tweet', 'emo', 'intent']

# The files are read with pd.read_table and no header row: names come from header_list.
df_train = pd.read_table('data/3b/train/out.txt', names=header_list)
train_x, train_y = list(df_train['tweet']), list(df_train['emo'])

df_test = pd.read_table('data/3b/test/out.txt', names=header_list)
test_x, test_y = list(df_test['tweet']), list(df_test['emo'])

# Same {'x': texts, 'y': labels} layout that naive_bayes_ expects.
train = dict(x=train_x, y=train_y)
test = dict(x=test_x, y=list(test_y))

In [None]:
# Build the classifier from the training split only (test=None, so the
# vocabulary contains training tokens only), then estimate priors and
# smoothed word likelihoods.
naive_bayes = naive_bayes_(train,None)
prior, matrix = naive_bayes.fit()

# pred=True -> one predicted label per input text.
pred = naive_bayes.predict(test['x'], True)
assert len(pred) == len(test['y'])

# Evaluation

In [None]:
# Score the predictions with the project helper utils.score (imported as
# `evaluate`). NOTE(review): its return value is not visible from this
# notebook -- confirm what `metric` contains.
metric = evaluate.evaluate(test['y'], pred, naive_bayes.labels)
metric

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
# (Stray text pasted from an e-mail used to sit at the top of this cell and
# made it a SyntaxError; it has been removed.)
cm = metrics.confusion_matrix(test['y'], pred)
# fmt='d' prints the exact integer counts; the default '.2g' rounds them to
# two significant digits.
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');

## Pearson correlation

In [None]:
# pred=False returns, per tweet, a dict label -> normalised probability.
# Keep only the (label, probability) pair with the highest probability.
# NOTE(review): this rebinds `pred` from a list of labels to a list of
# tuples, so cells that use `pred` as labels must not be re-run after this one.
pred = naive_bayes.predict(test['x'], False)
pred = [sorted(i.items(), key=lambda x:x[-1], reverse=True)[0] for i in pred]

In [None]:
# Show the first three entries of `pred`.
pred[:3]

In [None]:
# Build two tab-separated dumps with one line per test tweet:
#   t_a: index, tweet, gold 'emo', gold 'intent'
#   p_a: index, tweet, predicted label, probability of that label
true_rows, pred_rows = [], []
for i in range(len(test_x)):
    true_rows.append(f"{i}\t{test_x[i]}\t{df_test['emo'][i]}\t{df_test['intent'][i]}\n")
    pred_rows.append(f"{i}\t{test_x[i]}\t{pred[i][0]}\t{pred[i][1]}\n")
t_a, p_a = ''.join(true_rows), ''.join(pred_rows)

In [None]:
def save_(path, file, encoding="utf-8"):
    """Write the string ``file`` to ``path``, replacing any existing content.

    The encoding is explicit so that non-ASCII text (tweets, emoji) is written
    the same way on every platform instead of depending on the locale default.

    Args:
        path: destination file path.
        file: text to write.
        encoding: text encoding of the output file (default UTF-8).
    """
    with open(path, "w", encoding=encoding) as f:
        f.write(file)

In [None]:
# Write the gold and predicted TSV dumps to the working directory.
save_('true.txt', t_a)
save_('pred.txt', p_a)

In [None]:
# Shell escape: run eval.py under Python 2 with the arguments 1, pred.txt and true.txt.
# NOTE(review): eval.py is not part of this notebook -- presumably it reports
# the Pearson correlation named in the section title; confirm.
!python2 eval.py 1 pred.txt true.txt

.

.

.

# Feature implementation

In [24]:
from scipy import sparse
def load_npz(name):
    """Load the sparse matrix stored at ``data/3a/npz/<name>.npz``.

    Args:
        name: file stem (converted with ``str``) of the archive to load.

    Returns:
        The sparse matrix returned by ``scipy.sparse.load_npz``.
    """
    npz_path = f"data/3a/npz/{name!s}.npz"
    return sparse.load_npz(npz_path)

# Load the stored sparse count matrices, in the same order as before:
# first the three bow_* files, then the three bigram_* files.
uva, ute, utr = (
    load_npz(stem) for stem in ("bow_counts_cv", "bow_counts_test", "bow_counts_train")
)
btr, bte, bva = (
    load_npz(stem) for stem in ("bigram_counts_train", "bigram_counts_test", "bigram_counts_cv")
)

# train_df.to_csv('data/3a/gram_traincsv.csv')
# val_df.to_csv('data/3a/gram_valcsv.csv')
# test_df.to_csv('data/3a/gram_testcsv.csv')

In [26]:
# Display the first row of the matrix loaded from bow_counts_cv.npz.
uva[0]

<1x253751 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [None]:
# Training frame = train_df.csv and val_df.csv stacked row-wise.
# `val` holds the rows of test_df.csv, despite its name.
train = pd.concat([pd.read_csv("data/3a/train_df.csv"),pd.read_csv("data/3a/val_df.csv")],axis=0)
val = pd.read_csv("data/3a/test_df.csv")
print("finish loading dataset")
axis = 1  # columns; kept in case another cell reads this name

# Drop every column that has a missing value in EITHER frame. Dropping per
# frame independently could leave `train` and `val` with different feature
# columns, which breaks any model fitted on one and applied to the other.
nan_cols = (
    set(train.columns[train.isna().any().to_numpy()])
    | set(val.columns[val.isna().any().to_numpy()])
)
print(train.shape)
train = train.drop(columns=[c for c in train.columns if c in nan_cols])
print(train.shape)
print(val.shape)
val = val.drop(columns=[c for c in val.columns if c in nan_cols])
print(val.shape)

In [None]:
# Remove the raw text and the saved CSV index, which are not model features.
# Reassigning (instead of inplace=True) together with errors="ignore" makes
# the cell safe to re-run; the in-place version raised KeyError the second time.
non_feature_cols = ["tweets","Unnamed: 0"]
train = train.drop(columns=non_feature_cols, errors="ignore")
val = val.drop(columns=non_feature_cols, errors="ignore")
# Display the remaining columns of both frames for a side-by-side check.
list(train.columns), list(val.columns)

In [None]:
# Separate the label column from the features without mutating `train`/`val`,
# so the cell can be re-run (the in-place version raised KeyError on the
# second run because 'target' had already been removed).
y_train = train['target']
X_train = train.drop(columns=['target'])

y_test = val['target']
X_test = val.drop(columns=['target'])

In [None]:
# Preview the first rows of the training features.
X_train.head()#, Counter(X_train['punctuation'])

In [None]:
# Preview the first rows of the test features.
X_test.head()#, Counter(X_test['punctuation'])

.

# Decision tree

In [None]:
import numpy as np
from sklearn.decomposition import PCA

# Disabled search kept for reference: it grows n_components until the summed
# explained-variance ratio reaches 0.9999.
# NOTE(review): presumably this search is how the value 16 below was chosen -- confirm.
# energy = 0
# for i in tqdm(range(1,768)):
#     pca = PCA(n_components=i)
#     pca.fit(X_train)
#     energy = sum(pca.explained_variance_ratio_)
#     print(energy)
#     if energy >=0.9999:
#         print(i)
#         break
# print(energy)

# Number of principal components to keep.
i=16        
pca = PCA(n_components=i)
# Fit the projection on the training features; `X` is the projected training
# matrix that the classifier cells fit on. Test features go through
# pca.transform(...) in those cells.
X = pca.fit_transform(X_train)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree classifier on the PCA-projected features. A fixed random_state
# makes the fitted tree reproducible across runs, in line with the seeded SVC
# and MLP cells.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X, y_train)

# Test features must go through the same PCA projection as the training data.
y_pred = dt.predict(pca.transform(X_test))
evaluate.evaluate(y_test,y_pred,set(y_test))

# SVM (sklearn SVC)

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier on the PCA-projected features.
clf_SVC = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True,
          probability=False, tol=0.001, cache_size=200, class_weight=None,
          verbose=1, max_iter=-1, decision_function_shape="ovr", random_state = 0)

# Fit on the projected training matrix.
clf_SVC.fit(X,y_train)

# Training accuracy must be scored on the same projected matrix `X` the model
# was fitted on; scoring the raw `X_train` passes the wrong feature space.
print('Accuracy of SVC on training set: {:.2f}'.format(clf_SVC.score(X, y_train) * 100))

# Test features go through the same PCA projection before prediction.
y_pred = clf_SVC.predict(pca.transform(X_test))
print('Test')
evaluate.evaluate(y_test,y_pred,set(y_test))

# MLP

In [None]:
from sklearn.neural_network import MLPClassifier # neural network

# Small multilayer perceptron (two hidden layers of 3 units) on the
# PCA-projected features. MLPClassifier has no `n_jobs` parameter; passing
# one raises TypeError, so it has been removed.
clf = MLPClassifier(alpha=1e-1, hidden_layer_sizes=(3, 3), random_state=1, verbose=True, max_iter=20)
# print(clf.get_params())

# Fit on the projected training matrix.
clf.fit(X, y_train)

# Test features go through the same PCA projection before prediction.
y_pred = clf.predict(pca.transform(X_test))
print('Test')
evaluate.evaluate(y_test,y_pred,set(y_test))