In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sprs
import re
import string
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.metrics import accuracy_score, f1_score

## 1 Introduction to Natural Language Processing

In [73]:
#reading the dataset
df = pd.read_csv("IMDB Dataset.csv")

In [74]:
#function to remove html tags
def remove_html(data):
    return re.sub(r'<.*?>',' ', data)

#function to remove url
def remove_url(data):
    return re.sub(r'https?://\S+', ' ', data)

#remove punctuation
def remove_punc(data):
    return re.sub('\W+','', data)

#main function
stop_words = stopwords.words('english')
def preprocess_text(data):
    clear_data = remove_html(remove_url(data.lower())).split()
    #this remove emails
    text = (' '.join([remove_punc(w) for w in clear_data if not w in stop_words and "@" not in w]))
    return text.strip()

In [75]:
#cleaning the reviews from html tags, urls, stopwords and punctutation
df['review'] = df['review'].apply(preprocess_text)

In [83]:
class NLPModels:
    def __init__(self, col):
        self.col = col #column in which text data is stored
        
    def fit(self):
        self.count_words_across_document = {} #this is used to track the number of comments which contain a particular word
        self.list_words_sentences = [] #this list tracks word counts of each comment
        for sent in self.col: #iterating over each comment
            count_in_sentence = {} #this tracks the word seen in comment so far and its count
            for word in sent.split(): #iterating over each word in a comment
                
                #if word is already in dictionary increase the count else make a new key
                if word in self.count_words_across_document:
                    self.count_words_across_document[word]+=1
                    
                elif word not in self.count_words_across_document:
                    self.count_words_across_document[word]=1
                
                #this tracks word in each comment only
                if word in count_in_sentence:
                    count_in_sentence[word]+=1

                elif word not in count_in_sentence:
                    count_in_sentence[word]=1
            
            #make a list containing all the words across all comments
            self.list_words_sentences.append(count_in_sentence)
            
        #to make a two way connection between going from words to index or vice versa
        self.idx_to_word= {i:w for i,w in enumerate(self.count_words_across_document)}
        self.word_to_idx = {w:i for i,w in enumerate(self.count_words_across_document)}
        self.vocab_size = len(self.count_words_across_document) #number of unique words
            
    def BOW(self):
        bow = sprs.lil_matrix((1,self.vocab_size)) #sparse matrix as they are more efficent
        for sent in self.list_words_sentences: #iterate over each comment words dictionary
            rep = sprs.lil_matrix((1,self.vocab_size)) #empty matrix to store the final representation of the word
            for word, count in sent.items(): #iterating ober each word
                rep[0,self.word_to_idx[word]]=count #replacing the count in the representation matrix of that word
            bow = sprs.vstack([bow,rep]) #concatenate the matrix vertically to obtain one big matrix of size mxd
        return bow.toarray()[1:,:] 
    
    def TF_IDF(self):
        tf_idf = sprs.lil_matrix((1,self.vocab_size)) #sparse matrix as they are more efficent
        for sent in self.list_words_sentences: #iterate over each comment words dictionary
            rep = sprs.lil_matrix((1,self.vocab_size)) #empty matrix to store the final representation of the word
            for word, count in sent.items(): #iterating ober each word
                #term frequency-inverse document frequency
                rep[0,self.word_to_idx[word]]=count/len(sent) * np.log(len(self.col)/self.count_words_across_document[word])
            tf_idf = sprs.vstack([tf_idf,rep]) #concatenate the matrix vertically to obtain one big matrix of size mxd
        return tf_idf.toarray()[1:,:]

In [87]:
n = NLPModels(df['review'].iloc[:100]) 
#fit the model
n.fit()
#obtain the BOW representation
bow_representation = n.BOW()
print(bow_representation)

[[1. 1. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 1. 1.]]


In [86]:
#obtain the tfidf representation
tf_idf_representation = n.TF_IDF()
print(tf_idf_representation)

[[-0.00014246  0.03313072  0.02315738 ...  0.          0.
   0.        ]
 [-0.00024753  0.          0.         ...  0.          0.
   0.        ]
 [-0.00023859  0.          0.         ...  0.          0.
   0.        ]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.00030004  0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.02436598  0.02436598
   0.02436598]]


## 2 Support Vector Machines

In [2]:
#reading the data
df = pd.read_csv("creditcard.csv")
#separating into X and Y
X = df.iloc[:,:-1]
y=df.iloc[:,-1]
y = df.iloc[:,-1].values
#make the y labels as -1,1 instead of 0,1
y = np.where(y>0,y,-1)

In [3]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [35]:
over_samp = RandomOverSampler(random_state=42) #oversampling
under_samp = TomekLinks() #undersampling
smote = SMOTE(random_state=101) #SMOTE

In [20]:
class Pegasos:
    def __init__ (self, lamda, k, projection):
        self.lamda = lamda #lambda value
        self.k = k #number of observations to be used
        self.projection = projection #for projections
        
    def gradient(self, p): #to calculate the gradient
        return np.where(p<1,1,0)
    
    def fit(self, x, y, n_iters=1500):
        m, n = x.shape
        self.W = np.zeros(n)
        for t in range(1,n_iters+1): #iterate till max_iters
            #pick a random instance
            idx = np.random.choice(range(m), self.k, replace=False)
            lr = 1/(self.lamda*t) #get the learning rate
            x_i = x[idx] #get x_i
            y_i = y[idx] #get y_i
            prod = y_i * (x_i@self.W) #obtain the product
            #update the weights
            self.W = (1-lr*self.lamda)*self.W + \
            (lr/self.k)*(np.sum(np.multiply(y_i.reshape(-1,1),x_i)*self.gradient(prod).reshape(-1,1),axis=0))
    
    def predict(self, x):
        #transform the inputs using the weight vector
        p = x@self.W.reshape(-1,1)
        return np.sign(p) #the sign function outputs the class

In [21]:
class SVMDC:
    def __init__ (self, C, mode = "L1", tol=1e-3):
        self.C = C #C value
        self.mode = mode
        self.tol = tol #tolerance value to break out of the loop
    
    def partial_gradient(self,G,a,U): #to calculate the partial gradient
        if a == 0:
            return min(G,0)
        elif a == U:
            return max(G,0)
        elif (a>0) and (a<U):
            return G
    
    def fit(self, X, y,iters=100):
        m, n = X.shape
        self.w = 0 #weight matrix
        
        #SVMDC can be done in L1 and L2 modes
        if self.mode == "L1":
            Dii = 0
            U=self.C
        else:
            Dii = 1/(2*self.C)
            U=np.inf
        
        #to get the langrangian multipliers
        alpha = np.zeros(m)
        self.w = np.zeros(shape=(n)) #initialize the weight matrix
        Qii = np.sum(X**2, 1) + Dii #calculate Qii
        for t in range(iters): #iterate till max_iters
            err = 0 #calculate error to break the loop
            for i in range(m): #iterate over each instance
                Qhat = Qii[i] #get Q_bar
                G = np.multiply(np.dot(self.w,X[i,:]),y[i]) - 1 + Dii * alpha[i] #gradient of the objective function
                PG = self.partial_gradient(G,alpha[i],U) #partial gradient of the objective function
                if np.abs(G) > err: #to keep updating the error term
                    err = np.abs(G)
                
                #to find optimal solution
                if np.abs(G) > 0: 
                    alpha_new = min(max(alpha[i]-G/Qhat,0),U)
                    self.w = self.w+(np.multiply((alpha_new - alpha[i])* y[i] ,X[i,:]))
                    alpha[i] = alpha_new
            
            #stop iterating once the error fall below tolerance        
            if err<self.tol:
                break
        
    def predict(self, x):
        #project the points using the weight matrix
        p = x@self.w.reshape(-1,1)
        return np.sign(p) #the sign function tells the class which the object belong to

Now Experiment with different sampling techniques on these algorithms

In [25]:
sampler = [over_samp,under_samp,smote]
peg_single = Pegasos(0.01,1,projection=False)
peg_batch = Pegasos(0.01,10,projection=False)
svm = SVMDC(0.1)
algo = [peg_single, peg_batch, svm]

In [26]:
entry = []
for samp in sampler:
    X_train_ov,y_train_ov = samp.fit_resample(X_train,y_train)
    model_accuracy = []
    for model in algo:
        model.fit(X_train_ov.values,y_train_ov)
        model_accuracy.append(accuracy_score(y_test,model.predict(X_test)))
    entry.append(model_accuracy)

In [30]:
results = pd.DataFrame(entry,index=['Pegasos','Pegasos Mini Batch','SVMDC'],columns=['Oversampler','Undersampler','SMOTE'])

In [39]:
results

Unnamed: 0,Oversampler,Undersampler,SMOTE
Pegasos,0.998192,0.998227,0.997928
Pegasos Mini Batch,0.99828,0.99828,0.998244
SVMDC,0.00172,0.998262,0.006144


**Conclusion:** As we can infer from the results that the accuracies are highly volatile. The random state provided in the sampler highly influences the accuracy of the model on the test set. It was also found that for SVMDC only undersampling gave good results, different random state was also experimented here but no good was found. Among different models, the results were found to be more or less similar, however pegasos mini batch was found to be performing slightly better than the rest on all sampler. In the end it is also recommended to prefer undersampling as it keeps the size of the data manageable and avoid long training times.

## 3. Optuna - A hyperparameter optimization framework

Done on Colab

References:
1. https://medium.com/@jorlugaqui/how-to-strip-html-tags-from-a-string-in-python-7cb81a2bbf44
2. https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python
3. https://stackoverflow.com/questions/5843518/remove-all-special-characters-punctuation-and-spaces-from-string
4. https://github.com/samehkhamis/SVMDC
5. https://github.com/karndeepsingh/optuna