####IMPORT

In [2]:
import ntpath
from os import walk, path
from pathlib import Path
import os
import sys
from tqdm import tqdm


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# PREPROCESSING related imports

from random import shuffle
import re    
import string  
import pandas as pd 
import numpy as np 

import nltk
from nltk.corpus import twitter_samples                          
from nltk.stem import PorterStemmer        
from nltk.tokenize import TweetTokenizer   
from nltk.corpus import stopwords          

from sklearn.model_selection import train_test_split



In [3]:
dir = '.'
data_p = "/content/"
train_p = data_p + 'Dataset/train/pos/' 
train_n = data_p + 'Dataset/train/neg/'

def download_data():
  !gdown --id 1pwbVIT5yyUbQZTKp6Fy3gir9eOsUj3nB --output "{dir}/data.zip"

def unzip_data():
  f = dir+"/data.zip"
  !mkdir "$data_p"
  !unzip "$f" -d "$data_p"


In [4]:
def getF(directory, x="both"):
    # x could be "files" or "folders" or "both"

    folders = []
    for f in walk(directory):
        folders.extend(f)

    if x == "folders":

      return folders[1]
      N
      
        
    if x == "files":
        return folders[2]
    else:
      return folders[1], folders[2]



# read all text files

def read_Alltxt(read_path):
  files = getF(read_path,'files')
  txtFiles = []
  for f in files:
    with open(read_path+f, 'r', encoding="utf-8") as of:
      data = ''
      for line in of:
        data += line
      txtFiles.append(data)
  return txtFiles
    



def BOW(f_as_ftokens, labels):
    bow = {}

    for tokens, label in list(zip(f_as_ftokens, labels)):
        for token in tokens:
            bow[(token, label)] = bow.get((token, label), 0) + 1
        
    return bow



# feature vectors
def extract_features(f_as_ftokens, bow):
    # feature array
    features = np.zeros((1,3))
    # bias term added in the 0th index
    features[0,0] = 1
    
    # iterate processed_tweet
    for word in f_as_ftokens:
        # get the positive frequency of the word
        features[0,1] = bow.get((word, 1), 0)
        # get the negative frequency of the word
        features[0,2] = bow.get((word, 0), 0)
    
    return features

In [None]:
download_data()
unzip_data()

In [6]:
data_p = read_Alltxt(train_p)

data_n = read_Alltxt(train_n)

In [7]:
class Preprocessor():
    def __init__(self):
      self.tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                                      reduce_len=True)
      nltk.download('stopwords')
      self.stopwords_en = stopwords.words('english') 
      self.punctuation_en = string.punctuation
      self.stemmer = PorterStemmer() 

    def __clean_n_tokenize_text__(self, txt):

        # remove hyperlinks
        txt = re.sub(r'https?:\/\/.*[\r\n]*', '', txt)
        # remove hashtags
        txt = re.sub(r'#', '', txt)
        #remove email address
        txt = re.sub('\S+@\S+', '', txt)
        # remove numbers
        txt = re.sub(r'\d+', '', txt)
        txt =  self.tokenizer.tokenize(txt)
        return txt


    def __filter_stopwords__(self, tokens):
        # remove stopwords
        filtered_tokens = []

        for word in tokens:
            if (word not in self.stopwords_en and  # remove stopwords
                word not in self.punctuation_en):  # remove punctuation
                filtered_tokens.append(word)
        return filtered_tokens

    def __stem_tokens__(self, tokens):
        # store the stemmed word
        stemmed_tokens = [] 

        for word in tokens:
            stem_word = self.stemmer.stem(word)  
            stemmed_tokens.append(stem_word)
        return stemmed_tokens



    def preprocess(self, txts):
        final_tokens = []
        for _, txt in tqdm(enumerate(txts)):        
            txt = self.__clean_n_tokenize_text__(txt)                       
            txt = self.__filter_stopwords__(txt)
            txt = self.__stem_tokens__(txt)
            final_tokens.extend([txt])
        return final_tokens

In [8]:
processor = Preprocessor()

# process the positive and negative tweets
pfs_as_ftokens = processor.preprocess(data_p)
nfs_as_ftokens = processor.preprocess(data_n)

24it [00:00, 228.00it/s]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


12500it [00:52, 236.97it/s]
12500it [00:52, 237.64it/s]


In [9]:
# Labels

labels = [1 for i in range(len(pfs_as_ftokens))]
labels.extend([0 for i in range(len(nfs_as_ftokens))])
pos_neg_fs_as_ftoken = pfs_as_ftokens + nfs_as_ftokens

In [10]:
bow = BOW(pos_neg_fs_as_ftoken, labels)
df = pd.DataFrame(list(zip(pos_neg_fs_as_ftoken, labels)), columns=["f_as_tokens", "labels"])

In [11]:
train_Xt, test_Xt, train_Y, test_Y = train_test_split(df["f_as_tokens"], df["labels"], test_size = 0.2, shuffle=True)


In [12]:
train_X = np.zeros((len(train_Xt), 3))

for index, f_as_token in enumerate(train_Xt):
    train_X[index, :] = extract_features(f_as_token, bow)

# test X feature dimension
test_X = np.zeros((len(test_Xt), 3))

for index, f_as_token in enumerate(test_Xt):
    test_X[index, :] = extract_features(f_as_token, bow)

In [13]:


classifier = LogisticRegression(random_state=0)
classifier.fit(train_X, train_Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
predictions = classifier.predict(test_X)

In [15]:


print(confusion_matrix(test_Y,predictions))


[[1293 1181]
 [ 747 1779]]


In [16]:
print(classification_report(test_Y,predictions))


              precision    recall  f1-score   support

           0       0.63      0.52      0.57      2474
           1       0.60      0.70      0.65      2526

    accuracy                           0.61      5000
   macro avg       0.62      0.61      0.61      5000
weighted avg       0.62      0.61      0.61      5000



In [17]:
print(accuracy_score(test_Y, predictions))

0.6144
