In [23]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from imblearn.over_sampling import SMOTE,RandomOverSampler
from imblearn.combine import SMOTEENN,SMOTETomek

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

In [24]:
# Read the raw training file. Iterating a text file yields its lines one by
# one (line terminators kept), so `train` is a list with one string per row.
with open("/Users/sanjanagovindu/Downloads/DMAsst2/train_file.csv", "r") as trainData:
    train = list(trainData)

# Read the raw test file in exactly the same way.
with open("/Users/sanjanagovindu/Downloads/DMAsst2/test_file.csv", "r") as testData:
    test = list(testData)

In [25]:
# Accumulators filled while parsing the training rows: the feature text of
# each row goes to train_list, its label to train_label.
train_list, train_label = [], []

In [26]:
def convert_matrix_split(data, feat_range=100000):
    """Convert rows of whitespace-separated feature indices into a 0/1 matrix.

    Every element of ``data`` is a string holding 1-based feature indices
    separated by whitespace (a trailing newline is tolerated). Row ``i`` of
    the result has a 1 in column ``k - 1`` for each index ``k`` that appears
    in ``data[i]`` and 0 in every other column.

    Parameters
    ----------
    data : sequence of str
        One string of feature indices per sample. An empty or blank string
        produces an all-zero row.
    feat_range : int, default 100000
        Number of columns of the matrix, i.e. the largest valid feature index.

    Returns
    -------
    pandas.DataFrame
        Frame of shape ``(len(data), feat_range)`` whose columns are labelled
        ``0 .. feat_range - 1``.

    Raises
    ------
    IndexError
        If a row contains an index outside ``1 .. feat_range``. Index 0 and
        negative indices are rejected explicitly because ``k - 1`` would
        otherwise wrap around and silently set a column at the far end.
    ValueError
        If a token in a row is not an integer.
    """
    # Allocate the full matrix once. Growing a DataFrame one row at a time
    # with .loc re-allocates the frame on every insertion, which is very slow
    # at this width. int8 is enough to hold 0/1 and keeps memory small.
    dense = np.zeros((len(data), feat_range), dtype=np.int8)

    for row, line in enumerate(data):
        # str.split() with no argument splits on any whitespace run and
        # ignores leading/trailing whitespace such as the final newline.
        indices = np.array([int(token) for token in line.split()], dtype=np.int64)
        if indices.size == 0:
            continue
        if indices.min() < 1 or indices.max() > feat_range:
            raise IndexError(
                "feature index out of range 1..%d in row %d" % (feat_range, row)
            )
        # Indices in the input are 1-based; columns are 0-based.
        dense[row, indices - 1] = 1

    return pd.DataFrame(dense, columns=range(feat_range))

In [27]:
# Split every training row into its label (first character) and its feature
# text (everything after the first tab).
# The lists are (re)created here so that re-running this cell does not append
# a second copy of every row to lists left over from a previous run.
train_list = []
train_label = []

for line in train:
    line = line.rstrip("\n")
    if not line.strip():
        # Skip blank lines (e.g. a trailing empty line at the end of the file)
        # instead of recording a bogus label for them.
        continue

    train_label.append(line[0])

    # Remove only the leading "<label>\t" prefix. Replacing "0\t"/"1\t"
    # everywhere in the row would also eat any later tab that happens to
    # follow a 0 or 1 and glue neighbouring numbers together.
    _, tab, features = line.partition("\t")
    train_list.append(features if tab else line)

In [28]:
# Labels as a NumPy array (one entry per training row, taken from train_label).
y_train = np.asarray(train_label)

# Expand the index strings of both datasets into 0/1 feature matrices.
train_data = convert_matrix_split(train_list)
test_data = convert_matrix_split(test)

In [29]:
def reduceDimentionality(train_data, test_data):
    """Project both datasets onto 500 components with a truncated SVD.

    The decomposition is fitted on ``train_data`` only and the fitted
    transformer is then applied to ``test_data``.

    Parameters
    ----------
    train_data : matrix the SVD is fitted on and transformed.
    test_data : matrix transformed with the already fitted SVD.

    Returns
    -------
    tuple
        ``(train_vector, test_vector)`` - the reduced training and test data.
    """
    # Earlier experiments kept for reference: PCA(n_components=1000) and
    # TruncatedSVD(n_components=1000).
    svd = TruncatedSVD(random_state=42, n_components=500)

    # fit_transform must run before transform: it is what fits the model.
    reduced_train = svd.fit_transform(train_data)
    reduced_test = svd.transform(test_data)
    return reduced_train, reduced_test

In [30]:
def performClassification(train_data, test_data, y_train):
    """Train a Bernoulli Naive Bayes classifier and predict the test labels.

    Parameters
    ----------
    train_data : feature matrix used to fit the classifier.
    test_data : feature matrix to predict labels for.
    y_train : labels matching the rows of ``train_data``.

    Returns
    -------
    The value returned by ``BernoulliNB.predict(test_data)``: one predicted
    label per row of ``test_data``.

    Notes
    -----
    Alternatives that were tried in place of BernoulliNB (same fit/predict
    usage):
      * GaussianNB()
      * DecisionTreeClassifier()
      * MLPClassifier(max_iter=300, activation='relu', solver='adam',
        hidden_layer_sizes=(5,5,5))
    NOTE(review): BernoulliNB thresholds its inputs via its ``binarize``
    setting; confirm that is intended for SVD-reduced (real-valued) features.
    """
    print("Performing - Naive Bayes - Bernoulli's Classification")

    classifier = BernoulliNB()
    classifier.fit(train_data, y_train)
    return classifier.predict(test_data)

In [31]:
#Hold-out split (currently disabled): 70% of the data for training, 30% held out for testing
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=r) 

In [32]:
#Dimensionality reduction using the SVD technique: the decomposition is fitted on
#the training matrix and then applied to the test matrix (see reduceDimentionality)
train_vector, test_vector = reduceDimentionality(train_data, test_data)

In [None]:
#Dealing with imbalanced data using SMOTE, SMOTEENN, SMOTE Tomek and Random Over Sampler
# smote = SMOTE(random_state = 42)
# train_vector, y_train = smote.fit_resample(train_vector, y_train)

# smoteTomek = SMOTETomek(random_state=42)
# train_vector, y_train = smoteTomek.fit_resample(train_vector, y_train)

# smoteenn = SMOTEENN(random_state=42)
# train_vector, y_train = smoteenn.fit_resample(train_vector, y_train)

# randomOverSampler = RandomOverSampler(random_state=42)
# train_vector, y_train = randomOverSampler.fit_resample(train_vector, y_train)

In [34]:
#Apply the classifier to the dimensionality-reduced matrices: fit on the reduced
#training data with y_train, then predict a label for each reduced test row
predictions = performClassification(train_vector, test_vector, y_train)

Performing - Naive Bayes - Bernoulli's Classification


In [35]:
#Output file with predictions for F1-score calculation: one predicted label per line.
#The context manager closes the file even if writing fails part-way through.
with open('/Users/sanjanagovindu/Downloads/DMAsst2/output.csv', 'w') as out:
    out.writelines("%s\n" % x for x in predictions)

In [36]:
print(len(predictions)) #Number of predictions = number of rows written to the output file (350 in the recorded run)

350
