##### import statements

In [1]:
from classes.Classifiers.Classifiers import Classifiers
from classes.Classifiers.ClassBalancer import ClassBalancer
from classes.Classifiers.FeatureSelector import FeatureSelector
from classes.Classifiers.ModelComparison import ModelComparison
from classes.Utils.PandasUtils import PandasUtils

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#numerical
import numpy as np
import pandas as pd
import math
from glob import glob 

#os related operations
import os 
from os import listdir
from os.path import isfile, join

#data structures
from collections import Counter
from itertools import chain
from itertools import islice

#visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
sns.set(style="darkgrid")

import warnings
warnings.filterwarnings('ignore')


In [3]:
#sklearn
from sklearn.model_selection import PredefinedSplit
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.feature_selection import SelectKBest, chi2,f_classif 

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score,f1_score
from sklearn.metrics import classification_report, confusion_matrix, roc_curve  

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC,SVC #multiclass
from sklearn.linear_model import LogisticRegression #multiclass
from sklearn.ensemble import RandomForestClassifier #multilabel, multioutput
from sklearn.tree import DecisionTreeClassifier  #multilabel, multioutput
from sklearn.ensemble import GradientBoostingClassifier #multiclass

#### set path

In [4]:
path = os.getcwd()
results_path = os.path.join(path,'results','single-label-classification','')
features_path = os.path.join(path,'features','')

In [5]:
if os.path.exists(results_path):
    pass
else:
    os.mkdir(results_path)

In [6]:
# results are stored in a dict
acc_default = {}
f1score_default = {}

In [7]:
# ['code-stat_','code-extend-proper_','all-features_','_no-code','code_','code-comment_']
content = 'code'
setting = '_'+content+'_'+'scaled'
settings = [m+setting for m in ['rf','dt','gb','lsvc','svc','log'] ]

for model_name in settings:
    acc_default[model_name] = {}
    f1score_default[model_name] = {}

### Load Data

In [8]:
test_features = pd.read_pickle(features_path+'test_features.pkl')
train_features = pd.read_pickle(features_path+'train_features.pkl')
validation_features = pd.read_pickle(features_path+'validation_features.pkl')
#print(train_features.isna().sum())
train_features.index = range(train_features.shape[0])
validation_features.index = range(validation_features.shape[0])
test_features.index = range(test_features.shape[0])

train_features.fillna(0,inplace=True)
test_features.fillna(0,inplace=True)
validation_features.fillna(0,inplace=True)
print(train_features.isna().sum())
print(train_features.shape)
print(validation_features.shape)
print(test_features.shape)

filename                0
cell_type               0
cell_number             0
execution_count         0
linesofcomment          0
linesofcode             0
variable_count          0
function_count          0
text/plain              0
image/png               0
text/html               0
execute_result          0
display_data            0
stream                  0
error                   0
text                    0
comment                 0
code_line_before        0
code_line_after         0
markdown_heading        0
packages_info           0
primary_label           0
helper_functions        0
load_data               0
data_exploration        0
data_preprocessing      0
evaluation              0
modelling               0
prediction              0
result_visualization    0
save_results            0
comment_only            0
dtype: int64
(5833, 32)
(1927, 32)
(1918, 32)


#### labels

In [9]:
labels = ['helper_functions','load_data',
              'data_preprocessing','data_exploration',
              'modelling','evaluation','prediction',
              'result_visualization','save_results',
              'comment_only']

#### training + validation dataset!

In [10]:
df = pd.DataFrame()
#df= df.add(features)
#df = df.add(train_features)
#df = df.add(validation_features)
#df.shape
df = pd.concat([df, train_features,  validation_features], axis=0)

# Reset index if needed
df = df.reset_index(drop=True)

#print(df)

In [11]:
print("Total notebook in training+validation: ",len(set(df.filename.values)))
print("Total notebook in test: ",len(set(test_features.filename.values)))

Total notebook in training+validation:  376
Total notebook in test:  94


### Classifier Setup and Feature Engineering

### Data Preprocessing

#### Categorisation of features 
*if necessary

In [12]:
def categorize(df):
    cat = pd.Categorical(df['primary_label'].values, categories=labels)
    y, uniques = pd.factorize(cat)
    df['primary_label'] = np.asarray(y)
    
    cat = pd.Categorical(df['filename'].values)
    files, uniques = pd.factorize(cat)
    df['filename'] = np.asarray(files)
    
    return df,files,cat

In [13]:
temp_df = pd.DataFrame()
#temp_df = temp_df.add(df.copy())
temp_df = pd.concat([df, temp_df, test_features], axis=0)

# Reset index if needed
temp_df = temp_df.reset_index(drop=True)
#temp_df = temp_df.add(test_features.copy())
#print(temp_df)
temp_df.index = range(temp_df.shape[0])
temp_df,cat,files = categorize(temp_df)
#print(cat)
#print(files)
df1 = temp_df[0:df.shape[0]].copy()
df1.index = range(df1.shape[0])
print(df1.primary_label.unique())
print(df1.shape)

df2 = temp_df[df.shape[0]:].copy()
df2.index = range(df2.shape[0])
print(df2.primary_label.unique())
print(df2.shape)

[0 1 2 3 4 5 6 7 8 9]
(7760, 32)
[0 3 5 1 2 4 8 7 6 9]
(1918, 32)


##### for a pre-defined split of validation set for crossfold validation

In [14]:
test_fold = []
for i in range(0,train_features.shape[0]):
    test_fold.append(0)

In [15]:
len(test_fold)

5833

In [16]:
folds = {}
t_fold = 1
count = 1
for each in set(validation_features.filename.values):
    folds[each] = t_fold 
    if count%10 == 0:
        t_fold += 1
    count += 1

In [17]:
len(folds)

94

In [18]:
for idx,row in validation_features.iterrows():
    test_fold.append(folds[row['filename']])

In [19]:
len(test_fold)

7760

##### set the split

In [20]:
ps = PredefinedSplit(test_fold=test_fold)
print(set(test_fold))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}


#### Set up the classifier and indicate conditions to restrict the dataframe we will be working on

In [21]:
model = Classifiers(df1,labels)

conditions = (df1.cell_type == 'code')   #not needed because our dataframe already has only code datapoints
model.apply_conditions_to_dataframe(conditions) #a new dataframe #df_restricted will be created internally

(train,test,indices_train,indices_test)=model.test_train_data_set(df2)

Shape of the restricted dataframe:  (7760, 32)
Resetting index.
train.shape,test.shape
(7760, 32) (1918, 32)


In [22]:
#model.train

In [23]:
df1.isna().sum()

filename                0
cell_type               0
cell_number             0
execution_count         0
linesofcomment          0
linesofcode             0
variable_count          0
function_count          0
text/plain              0
image/png               0
text/html               0
execute_result          0
display_data            0
stream                  0
error                   0
text                    0
comment                 0
code_line_before        0
code_line_after         0
markdown_heading        0
packages_info           0
primary_label           0
helper_functions        0
load_data               0
data_exploration        0
data_preprocessing      0
evaluation              0
modelling               0
prediction              0
result_visualization    0
save_results            0
comment_only            0
dtype: int64

In [24]:
labels

['helper_functions',
 'load_data',
 'data_preprocessing',
 'data_exploration',
 'modelling',
 'evaluation',
 'prediction',
 'result_visualization',
 'save_results',
 'comment_only']

### Feature Engineering (Representation and Selection)

the important feature is the code itself and other features (extended/contextual, comment, code statistics are considered supplementary features in our task) 

In [25]:
#'text','comment','code_line_before','code_line_after', 'markdown_heading', 'packages_info'
features = ['text']
train,test = model.set_lexical_features(features) #same df is modified and contains our new feature column *new_text*

new (lexical) feature column created as 'new_text'


#### apply preprocessing to the text feature
we will use code_text_processing function to process our lexical feature *new_text* we created previously and store them in *text_processed*

In [26]:
train,test = model.preprocessing('new_text')

In [27]:
#dir(model)
#print(model.train_X)


##### tfidf and chi2

In [28]:
#vectorize the text (lexical) in 'text_processed' column
tfidf = TfidfVectorizer(ngram_range=(1,3),use_idf=True,max_df=0.2,min_df=2,stop_words='english') 
X_train,X_test,tfidf = model.vectorization(tfidf)

#use feature selection if necessary (pass k and training label)
k = 1000 #k lexical features to be retained
X_train_features,X_test_features,selector = model.feature_selection(chi2,k,train.primary_label.values)

tfidf transformation finished. shape of the feature vector:  (7760, 48485) (1918, 48485)
Selecting 1000 features...
train,test shape
(7760, 1000) (1918, 1000)


In [29]:
#tfidf_features = tfidf.get_feature_names()
tfidf_features = tfidf.get_feature_names_out()
print("tfidf features: ",len(tfidf_features))
print("before feature selection: ",len(selector.get_support()))
selected_features = selector.get_support()
text_features = [tfidf_features[i] for i in range(len(tfidf.get_feature_names_out())) if selected_features[i]==True]
print("after feature selection: ",len(text_features))

tfidf features:  48485
before feature selection:  48485
after feature selection:  1000


In [30]:
#following set of features are represented in numerical form
#metric features are: ['linesofcomment','linesofcode','variable_count','function_count']
#extended features are: ['filename','cell_number','execution_count','text/plain' , 'image/png', 'text/html', 'execute_result', 'display_data', 'stream', 'error']

stat_features = ['linesofcomment','linesofcode','variable_count','function_count']
X_train_features_,X_test_features_ = model.set_statistical_features(stat_features,X_train_features,X_test_features)

taking feature  linesofcomment   (7760, 1001)
taking feature  linesofcode   (7760, 1002)
taking feature  variable_count   (7760, 1003)
taking feature  function_count   (7760, 1004)
taking feature  linesofcomment   (1918, 1001)
taking feature  linesofcode   (1918, 1002)
taking feature  variable_count   (1918, 1003)
taking feature  function_count   (1918, 1004)
statistical features added


#### Together, text_features and stat_features form the original feature vector of our model

In [31]:
feature_vector = text_features+stat_features
len(feature_vector)

1004

#### Standardize the features 
since our statistical features are scaled in a different way to our text vectors

In [32]:

from sklearn.preprocessing import MinMaxScaler
ss = MinMaxScaler()
X_train_features_ = ss.fit_transform(X_train_features_)
X_test_features_ = ss.transform(X_test_features_)


In [33]:
#print(X_train_features_)

## Classifier Model

## 1. Compare Classifiers 
*default params

In [34]:
ks = [k] 
print(setting)
for k in ks:
    model_pipelines = [('rf',Pipeline([('clf', RandomForestClassifier(random_state=500))])),
                       ('dt',Pipeline([('clf', DecisionTreeClassifier(random_state=500))])),
                       ('gb',Pipeline([('clf', GradientBoostingClassifier(random_state=500))])),
                       ('lsvc',Pipeline([('clf', LinearSVC(random_state=500))])),
                       ('svc',Pipeline([('clf', SVC(random_state=500))])),
                       ('log',Pipeline([('clf', LogisticRegression(random_state=500))])),
                       ]
    
    for pipe in model_pipelines:
        model_name = pipe[0]
        pipeline = pipe[1]
        print('model: ',model_name, k)

        # Singlelabel classifier
        model = OneVsRestClassifier(pipeline).fit(X_train_features_,train['primary_label'])
        prediction = model.predict(X_test_features_)
        acc = accuracy_score(test['primary_label'], prediction)
        model_name = model_name+setting
        acc_default[model_name][k] = acc
        f1score = metrics.f1_score(test['primary_label'], prediction, average = 'weighted')
        f1score_default[model_name][k] = f1score
        print(' acc: ', acc, 'f1score: ', f1score)


_code_scaled
model:  rf 1000
 acc:  0.7106360792492179 f1score:  0.7029370860842276
model:  dt 1000
 acc:  0.5714285714285714 f1score:  0.5957588972841119
model:  gb 1000
 acc:  0.6934306569343066 f1score:  0.6848556784735591
model:  lsvc 1000
 acc:  0.6548488008342023 f1score:  0.6423958034493278
model:  svc 1000
 acc:  0.6595411887382691 f1score:  0.6494170709387062
model:  log 1000
 acc:  0.6418143899895725 f1score:  0.6253374118206392


In [None]:
pd.DataFrame.from_dict(f1score_default).to_pickle(results_path+'f1score_default.pkl')
pd.DataFrame.from_dict(acc_default).to_pickle(results_path+'acc_default.pkl')

### Analysis of results

In [None]:
acf = pd.DataFrame()
f1f = pd.DataFrame()
f1 = pd.read_pickle(results_path+'f1score_default.pkl')
a1 = pd.read_pickle(results_path+'acc_default.pkl')

In [None]:
print(a1.columns)

In [None]:
pd.DataFrame(acc_default).max().max()
pd.DataFrame(f1score_default).max().max()

In [None]:
for each in ['code-stat_','code-extend-proper_','all-features_','_no-code','code_','code-comment_']:
    cols = a1[a1.filter(like=each).columns]
    print(cols)
    for col in cols:
        acf[col] = a1[col]

for each in ['code-stat_','code-extend-proper_','all-features_','_no-code','code_','code-comment_']:
    cols = f1[f1.filter(like=each).columns]
    for col in cols:
        f1f[col] = f1[col]

In [None]:
my_data = f1f[f1f >= f1f.max().max()].to_numpy()
for idrow, row in enumerate(my_data):
    for idcol, col in enumerate(row):
        if not pd.isnull(col):
            print("f1 Value :"+str(col)+" column:"+str(idcol)+" row:"+str(idrow),f1f.columns[idcol],f1f.index[idrow])
            
my_data = acf[acf >= acf.max().max()].to_numpy()
for idrow, row in enumerate(my_data):
    for idcol, col in enumerate(row):
        if not pd.isnull(col):
            print("acc Value :"+str(col)+" column:"+str(idcol)+" row:"+str(idrow),acf.columns[idcol],acf.index[idrow])

In [None]:
f1score_dict, acc_dict = {},{}

for col in f1f.T[1000].index.values:
    f1score_dict[col] = round(f1f[col][1000],3)
    acc_dict[col] = round(acf[col][1000],3)

In [None]:
slabel = pd.DataFrame({'model':f1score_dict.keys(),'accuracy':acc_dict.values(),'f1score':f1score_dict.values()})
slabel.index = slabel['model']
slabel.to_pickle(results_path+'single_label_results.pkl')
slabel

In [None]:
unique_values, counts = np.unique(train["primary_label"], return_counts=True)

print(counts)
import pickle
file_path = 'C:\\Users\\lukas\\OneDrive\\Dokumente\\tuwien\\MachineLearning\\exdds\\results\\single-label-classification\\acc_default.pkl'

with open(file_path, 'rb') as file:
    # Load the object from the file
    loaded_object = pickle.load(file)

print(loaded_object)

## 2. Best parameters using GridSearchCV


In [None]:
results = {}
accuracy_all = {}
f1_score_all = {}

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

parameters = {'rf':{}}
parameters['rf']['estimator__rf__criterion'] = ['gini','entropy']
parameters['rf']['estimator__rf__n_estimators'] = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]
parameters['rf']['estimator__rf__class_weight']= ['balanced', 'balanced_subsample']

In [None]:
k = 1000
print("******",k,"*******")
model_pipelines = [('rf',Pipeline([('rf', RandomForestClassifier(random_state=500))]))]
    
for pipe in model_pipelines:
    model_name = pipe[0]+'['+content+']'
    pipeline = pipe[1]
    print('model: ',model_name, ' :: ', 'pipeline: ', pipeline)
           
    CV = GridSearchCV(OneVsRestClassifier(pipeline), parameters[pipe[0]], scoring = 'accuracy', n_jobs= 1,cv=ps)
    CV.fit(X_train_features_,train['primary_label'])
    prediction_single = CV.predict(X_test_features_)
    print(' acc: ', accuracy_score(test['primary_label'], prediction_single))
    print(' f1score: ', metrics.f1_score(test['primary_label'], prediction_single, average = 'weighted'))


In [None]:
print(classification_report(test['primary_label'], prediction_single, target_names=labels))

In [None]:
fig, ax = plt.subplots(figsize=(8,7))
sns.heatmap(confusion_matrix(test['primary_label'], prediction_single)
                    , annot=True, fmt='d',
            cmap=sns.diverging_palette(20, 220, n=200),
            linewidths = 1,robust=True, center=0)
plt.ylabel('Actual')
plt.xlabel('Predicted')
title = 'Confusion matrix - Single label RF Classifier'
#ax.set_title(title,fontsize=15,fontweight='bold')
ax.set_yticklabels(labels,rotation=0)
ax.set_xticklabels(labels,rotation=90)
fig.tight_layout()
ax.figure.savefig(results_path+'singlelabelconfusion.eps', format='eps')
print("results saved in ",results_path)
plt.show()


## Modified analysis


In [38]:
content = 'code-comment'
setting = '_'+content+'_'+'scaled'
settings = [m+setting for m in ['rf','dt','gb','lsvc','svc','log'] ]

for model_name in settings:
    acc_default[model_name] = {}
    f1score_default[model_name] = {}

In [40]:
model = Classifiers(df1,labels)

conditions = (df1.cell_type == 'code')   #not needed because our dataframe already has only code datapoints
model.apply_conditions_to_dataframe(conditions) #a new dataframe #df_restricted will be created internally

(train,test,indices_train,indices_test)=model.test_train_data_set(df2)

Shape of the restricted dataframe:  (7760, 32)
Resetting index.
train.shape,test.shape
(7760, 32) (1918, 34)


In [42]:
features = ['text', 'comment']
train,test = model.set_lexical_features(features)

new (lexical) feature column created as 'new_text'


In [43]:
train,test = model.preprocessing('new_text')

In [44]:
#vectorize the text (lexical) in 'text_processed' column
tfidf = TfidfVectorizer(ngram_range=(1,3),use_idf=True,max_df=0.2,min_df=2,stop_words='english') 
X_train,X_test,tfidf = model.vectorization(tfidf)

#use feature selection if necessary (pass k and training label)
k = 1000 #k lexical features to be retained
X_train_features,X_test_features,selector = model.feature_selection(chi2,k,train.primary_label.values)

tfidf transformation finished. shape of the feature vector:  (7760, 56123) (1918, 56123)
Selecting 1000 features...
train,test shape
(7760, 1000) (1918, 1000)


In [45]:
#tfidf_features = tfidf.get_feature_names()
tfidf_features = tfidf.get_feature_names_out()
print("tfidf features: ",len(tfidf_features))
print("before feature selection: ",len(selector.get_support()))
selected_features = selector.get_support()
text_features = [tfidf_features[i] for i in range(len(tfidf.get_feature_names_out())) if selected_features[i]==True]
print("after feature selection: ",len(text_features))

tfidf features:  56123
before feature selection:  56123
after feature selection:  1000


In [46]:
stat_features = ['linesofcomment','linesofcode','variable_count','function_count']
X_train_features_,X_test_features_ = model.set_statistical_features(stat_features,X_train_features,X_test_features)
feature_vector = text_features+stat_features


taking feature  linesofcomment   (7760, 1001)
taking feature  linesofcode   (7760, 1002)
taking feature  variable_count   (7760, 1003)
taking feature  function_count   (7760, 1004)
taking feature  linesofcomment   (1918, 1001)
taking feature  linesofcode   (1918, 1002)
taking feature  variable_count   (1918, 1003)
taking feature  function_count   (1918, 1004)
statistical features added


In [47]:
X_train_features_ = ss.fit_transform(X_train_features_)
X_test_features_ = ss.transform(X_test_features_)

In [48]:
ks = [k] 
print(setting)
for k in ks:
    model_pipelines = [('rf',Pipeline([('clf', RandomForestClassifier(random_state=500))])),
                       ('dt',Pipeline([('clf', DecisionTreeClassifier(random_state=500))])),
                       ('gb',Pipeline([('clf', GradientBoostingClassifier(random_state=500))])),
                       ('lsvc',Pipeline([('clf', LinearSVC(random_state=500))])),
                       ('svc',Pipeline([('clf', SVC(random_state=500))])),
                       ('log',Pipeline([('clf', LogisticRegression(random_state=500))])),
                       ]
    
    for pipe in model_pipelines:
        model_name = pipe[0]
        pipeline = pipe[1]
        print('model: ',model_name, k)

        # Singlelabel classifier
        model = OneVsRestClassifier(pipeline).fit(X_train_features_,train['primary_label'])
        prediction = model.predict(X_test_features_)
        acc = accuracy_score(test['primary_label'], prediction)
        model_name = model_name+setting
        acc_default[model_name][k] = acc
        f1score = metrics.f1_score(test['primary_label'], prediction, average = 'weighted')
        f1score_default[model_name][k] = f1score
        print(' acc: ', acc, 'f1score: ', f1score)


_code_scaled
model:  rf 1000
 acc:  0.700208550573514 f1score:  0.6933024771545233
model:  dt 1000
 acc:  0.5860271115745568 f1score:  0.6087209535319967
model:  gb 1000
 acc:  0.6892596454640251 f1score:  0.6795334448612061
model:  lsvc 1000
 acc:  0.6522419186652764 f1score:  0.6399809780017214
model:  svc 1000
 acc:  0.6386861313868614 f1score:  0.6264256040621544
model:  log 1000
 acc:  0.6381647549530761 f1score:  0.6225801162852651
