### Import Useful Libraries

In [1]:
import pandas as pd
import numpy as np
import time
import ast
from sklearn.metrics import hamming_loss, f1_score ,classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from google.colab import drive

start_time = time.time()
drive.mount('/content/drive')

Mounted at /content/drive


### Rerieve Dataset

In [2]:
max = 150
path = '/content/drive/MyDrive/Satori Assignment/data/preprocessed_data_'+ str(max) +'.csv'
df = pd.read_csv(path)
df['Target'] = df['Target'].apply(ast.literal_eval)
df['Tag'] = df['Tag'].apply(ast.literal_eval)

print(df.shape)
display(df.head())

(70474, 5)


Unnamed: 0,Text,Tag,Num_Tags,Num_Words,Target
0,get value built encoded viewstate get value bu...,"[asp.net, c#]",2,65,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
1,logically reorder columns table logically reor...,[sql-server],1,69,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,convert hashbytes varchar convert hashbytes va...,"[sql, sql-server]",2,44,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,mysql error php mysql query mysql error php my...,"[mysql, php]",2,138,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,difference datagrid gridview asp.net differenc...,[asp.net],1,74,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


> We load the preprocessed_data_K CSV file, which stores data after preprocessing with the Top K Tag Combinations.

In [3]:
unique_tags = set([tag for sublist in df['Tag'] for tag in sublist])

print(f"Number of Unique Tags: {len(unique_tags)}")
print(unique_tags)

Number of Unique Tags: 104
{'rust', 'sql-server-2008', 'github', 'swing', 'html', 'scala', 'wordpress', 'android-studio', 'codeigniter', 'c++11', 'entity-framework', 'excel-vba', 'facebook', 'angular2', 'css', 'c#', 'qt', 'prolog', 'mongodb', 'xslt', 'matplotlib', 'reactjs', 'angularjs', 'ios', 'linq', 'python-2.7', 'javascript', 'facebook-graph-api', 'ruby', 'css3', 'sqlite', 'objective-c', 'android-fragments', 'winforms', 'c++', 'templates', 'multithreading', 'ggplot2', 'jquery-ui', 'mysql', 'vim', 'd3.js', 'magento', 'symfony2', 'spring', 'java', 'regex', 'ruby-on-rails', 'r', 'sql-server', 'ruby-on-rails-3', 'vb.net', 'elasticsearch', 'iphone', 'batch-file', 'android', 'tsql', 'svn', 'php', 'jquery', 'wpf', 'ember.js', 'android-intent', 'meteor', 'ajax', 'python', 'asp.net-mvc', 'activerecord', 'android-layout', 'excel', 'hibernate', 'algorithm', 'data.table', 'list', 'numpy', 'postgresql', 'vba', 'oracle', 'perl', 'sql', 'c', 'git', 'bash', 'arrays', 'twitter-bootstrap', 'django',

### Split data

#### Convert Multilabel to Single-Label (for Split Purposes)

In [4]:
def calculate_tag_combinations(temp_df):
    """
    Create a dataframe 'tag_combination', which stores the unique combination of Tags and the number of appearance
    """

    if temp_df.empty:
        print('Error: Provided DataFrame is empty!')
        return

    temp_df['Tag'] = temp_df['Tag'].apply(sorted)
    temp_df['Tag_Tuple'] = temp_df['Tag'].apply(tuple)
    tag_combinations = temp_df.groupby('Tag_Tuple').size().reset_index(name='Count')
    temp_df.drop(columns=['Tag_Tuple'], inplace=True)
    tag_combinations = tag_combinations.sort_values(by='Count', ascending=False)

    sum_count = tag_combinations['Count'].sum()
    all_tags = [tag for tag_tuple in tag_combinations['Tag_Tuple'] for tag in tag_tuple]
    num_unique_tags = len(set(all_tags))

    print(f"Number of Questions of Tag Combinations: {sum_count}")
    print(f"Number of unique tags of Tag Combinations: {num_unique_tags}")
    print(f"Minimum Frequency of Tag Combinations: {tag_combinations['Count'].iloc[-1]}\n")
    display(tag_combinations)

    return tag_combinations

original_tag_combinations = calculate_tag_combinations(df)

Number of Questions of Tag Combinations: 70474
Number of unique tags of Tag Combinations: 104
Minimum Frequency of Tag Combinations: 144



Unnamed: 0,Tag_Tuple,Count
7,"(android,)",3630
92,"(javascript, jquery)",2700
85,"(java,)",2519
91,"(javascript,)",2501
116,"(php,)",2190
...,...,...
123,"(prolog,)",145
118,"(php, symfony2)",144
88,"(java, spring)",144
16,"(angularjs, angularjs-directive)",144


> Display in a proper way the distribution of Tag Combinations. This will be helpful so as to reassure in a given dataframe data are well distributed and randomly picken. This is a way to convert multilabel problem to single class problem and manually split the data on Tag Combinations and not on each single Tag.

In [5]:
def split_data(tag_combinations, df, create_val=True, train_ratio=0.8, val_ratio=0.1):
    # Shuffle the dataframe
    df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

    val_ratio = val_ratio if create_val else 0

    # Dicts to store counts for each combination
    train_counts = {combination: 0 for combination in tag_combinations['Tag_Tuple']}
    val_counts = {combination: 0 for combination in tag_combinations['Tag_Tuple']}
    test_counts = {combination: 0 for combination in tag_combinations['Tag_Tuple']}

    # Appropriate ratio of tag combinations for each set
    for index, row in tag_combinations.iterrows():
        combination = row['Tag_Tuple']
        count = row['Count']

        train_counts[combination] += int(count*train_ratio)
        val_counts[combination] += int(count*val_ratio)
        test_counts[combination] += count - int(count*train_ratio) - int(count*val_ratio)

    # Store data to list for each set
    train_data, val_data, test_data = [], [], []

    # Iterate over rows in df and assign to train, val, or test sets
    for index, row in df_shuffled.iterrows():
        combination = row['Tag']
        if train_counts[tuple(combination)] > 0:
            train_data.append(row)
            train_counts[tuple(combination)] -= 1
        elif test_counts[tuple(combination)] > 0:
            test_data.append(row)
            test_counts[tuple(combination)] -= 1
        elif val_counts[tuple(combination)] > 0:
            val_data.append(row)
            val_counts[tuple(combination)] -= 1

    assert len(train_data) + len(val_data) + len(test_data) == len(df), "Total number of examples in train, val, and test sets does not match the total number of examples in the original dataframe"
    return pd.DataFrame(train_data), pd.DataFrame(val_data), pd.DataFrame(test_data)

train_df, val_df, test_df = split_data(original_tag_combinations, df, create_val=False)

> Now with this generic function we utilise a logic with dicts in order to track the number of times of each Tag Combination and finally retrieve with a modifiable ratio train, test (and it is desired validation) sets.

#### Train, Val and Test Dataframes

In [6]:
train_df

Unnamed: 0,Text,Tag,Num_Tags,Num_Words,Target
0,missing assembly reference using natupnplib .n...,[c#],1,94,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,image background html page image background ht...,"[css, html]",2,95,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,onclick clickablespan working urlspan onclick ...,[android],1,118,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,return false works answer correct return false...,[jquery],1,365,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,given array v need find two indices j v j v j ...,[algorithm],1,87,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
59767,set path browsing popup set path browsing popu...,[batch-file],1,231,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
59890,filter warnings console java using deprecated ...,"[java, swing]",2,77,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
60556,want make ajax pause show progress bar 2s cont...,"[ajax, jquery, php]",3,213,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
60640,one ajax call block ajax call one ajax call bl...,"[ajax, jquery, php]",3,104,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
test_df

Unnamed: 0,Text,Tag,Num_Tags,Num_Words,Target
51921,drawing line finger touch iphone drawing line ...,"[iphone, objective-c]",2,41,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
52152,number white spaces string objective-c number ...,"[iphone, objective-c]",2,21,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
52826,generate patch svn histroy non svn target gene...,[svn],1,105,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
53026,use f# web developer express edition use f# we...,[f#],1,39,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
53091,reading file works iphone 6.1 simulator iphone...,[ios],1,164,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
70469,django mvt design code models views django mvt...,"[django, python]",2,85,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
70470,wpf showdialog display dialog wpf showdialog d...,[wpf],1,73,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
70471,chain async ajax function loop chain async aja...,"[ajax, jquery]",2,134,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
70472,necessary learn code access security cas neces...,[c#],1,28,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


#### Reassure the well distributed split

In [8]:
train_tag_combinations = calculate_tag_combinations(train_df)

Number of Questions of Tag Combinations: 56325
Number of unique tags of Tag Combinations: 104
Minimum Frequency of Tag Combinations: 115



Unnamed: 0,Tag_Tuple,Count
7,"(android,)",2904
92,"(javascript, jquery)",2160
85,"(java,)",2015
91,"(javascript,)",2000
116,"(php,)",1752
...,...,...
123,"(prolog,)",116
118,"(php, symfony2)",115
88,"(java, spring)",115
16,"(angularjs, angularjs-directive)",115


In [9]:
test_tag_combinations = calculate_tag_combinations(test_df)

Number of Questions of Tag Combinations: 14149
Number of unique tags of Tag Combinations: 104
Minimum Frequency of Tag Combinations: 29



Unnamed: 0,Tag_Tuple,Count
7,"(android,)",726
92,"(javascript, jquery)",540
85,"(java,)",504
91,"(javascript,)",501
116,"(php,)",438
...,...,...
88,"(java, spring)",29
16,"(angularjs, angularjs-directive)",29
123,"(prolog,)",29
118,"(php, symfony2)",29


> We are pretty sure right now that each combination is well distributed and thus this will lead to better model's performance. Balanced datasets enable models with way more probabilities to capture patterns.

### TFidf Feature Extraction

In [10]:
tfidf_vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,3))

X_train = tfidf_vectorizer.fit_transform(train_df['Text'])
Y_train = np.array(train_df['Target'].tolist())
print('X_train shape:', X_train.shape)
print('Y_train shape:', Y_train.shape)

X_test = tfidf_vectorizer.transform(test_df['Text'])
Y_test = np.array(test_df['Target'].tolist())
print('X_test shape:', X_test.shape)
print('Y_test shape:', Y_test.shape)

X_train shape: (56325, 20000)
Y_train shape: (56325, 104)
X_test shape: (14149, 20000)
Y_test shape: (14149, 104)


So we apply TFIDF with max_features=20000 and considering unigrams,bi-grams and tri-grams, cause there are many important combinations like 'unix environment' or 'android application'.

### Sklearn Models

> In order to experiment with different Sklearn models, create a generic function to monitor each algorithm's performance.

In [11]:
def train_and_evaluate_model(model, model_name):
    start_training = time.time()
    # Train and Predict
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    # Evaluation metrics
    hamming_loss_val = round(hamming_loss(Y_test, Y_pred), 2)
    micro_f1 = round(f1_score(Y_test, Y_pred, average='micro'), 2)
    macro_f1 = round(f1_score(Y_test, Y_pred, average='macro'), 2)

    print("-" * 90)
    print(f"> {model_name}")
    print(f"Hamming Loss: {hamming_loss_val}\tMicro-F1: {micro_f1}\t\tMacro-F1: {macro_f1}\t\tTime: {int((time.time()-start_training))} secs")

    return Y_pred

> **[Hamming Loss](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.hamming_loss.html)**: Hamming loss measures the fraction of labels that are incorrectly predicted. It treats each label prediction independently and penalizes each misclassification equally.

> **[Micro-F1 Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)**: This metric computes the F1 score for each label individually and then averages them using the micro-average strategy. It is suitable for imbalanced datasets where some labels may have significantly fewer instances than others. Micro-F1 gives equal weight to each instance and each label. Micro-F1 is suitable when you want to evaluate the overall performance of the model without considering label imbalances.

> **[Macro-F1 Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)**: Macro-F1 computes the F1 score for each label individually and then averages them using the macro-average strategy. Each label contributes equally to the average, regardless of the number of instances. Macro-F1 is useful when you want to evaluate the performance of the model on each label separately and give equal importance to each label.

> Using these three metrics together provides a **holistic** view of the model's performance in multi-label classification problem, considering different aspects such as overall accuracy, label-wise accuracy, and the balance between precision and recall.






> The Hamming loss is the fraction of labels that are incorrectly predicted. Ranges from [0,1] and it's logic is the smaller the better. With the 2 following simple and concrete examples it would be easier to understand.

----

    example_pred = [0, 0, 0, 0, 1]
    example_true = [0, 0, 0, 1, 0]
    hamming_loss(example_true, example_pred)
    
> In the above example, Hamming Loss = 0.4.

----

    example_pred = [0, 0, 0, 0, 1]
    example_true = [0, 0, 0, 1, 1]
    hamming_loss(example_true, example_pred)
    
> In the above example, Hamming Loss = 0.2.

----

    

In [12]:
init = time.time()

# Logistic Regression
reg_model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
train_and_evaluate_model(reg_model, 'Logistic Regression')

# Stohastic Gradient Descent
sgd_model = OneVsRestClassifier(SGDClassifier())
train_and_evaluate_model(sgd_model, 'SGD')

# Linear SVC
svc_model = OneVsRestClassifier(LinearSVC())
Y_pred = train_and_evaluate_model(svc_model, 'Linear SVC')

# Multinomial Naive Bayes
nb_model = OneVsRestClassifier(MultinomialNB(alpha=1.0))
train_and_evaluate_model(nb_model, 'Multinomial Naive Bayes')

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100)
train_and_evaluate_model(rf_model, 'Random Forest')

print("-" * 90)
print("\nCell Runtime:", int((time.time()-init)/60), "minutes")

------------------------------------------------------------------------------------------
> Logistic Regression
Hamming Loss: 0.01	Micro-F1: 0.7		Macro-F1: 0.54		Time: 193 secs
------------------------------------------------------------------------------------------
> SGD
Hamming Loss: 0.01	Micro-F1: 0.7		Macro-F1: 0.57		Time: 20 secs
------------------------------------------------------------------------------------------
> Linear SVC
Hamming Loss: 0.01	Micro-F1: 0.77		Macro-F1: 0.69		Time: 35 secs
------------------------------------------------------------------------------------------
> Multinomial Naive Bayes
Hamming Loss: 0.01	Micro-F1: 0.55		Macro-F1: 0.18		Time: 5 secs
------------------------------------------------------------------------------------------
> Random Forest
Hamming Loss: 0.01	Micro-F1: 0.68		Macro-F1: 0.42		Time: 782 secs
------------------------------------------------------------------------------------------

Cell Runtime: 17 minutes


> We choose lightweight, cheap, efficient and more simple structured algorithms in order to experiment with and ultimately set our Baseline model.

> We conclude that the best classifier, for our Baseline Model would **Linear SVC**, who except of each descent performance based on our metrics, he's incredibly fast and time efficient.

### Best Classifier

#### Cross Validation

In [13]:
hamming_losses, micro_f1_scores, macro_f1_scores = [], [], []
kf = KFold(n_splits=5)

# Perform manual k-fold cross-validation
for fold_idx, (train_index, test_index) in enumerate(kf.split(X_train), 1):
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    Y_train_fold, Y_test_fold = Y_train[train_index], Y_train[test_index]

    # Train and Predict
    svc_model.fit(X_train_fold, Y_train_fold)
    Y_pred_fold = svc_model.predict(X_test_fold)

    # Each Fold's Metrics
    hamming_loss_fold = round(hamming_loss(Y_test_fold, Y_pred_fold), 2)
    micro_f1_fold = round(f1_score(Y_test_fold, Y_pred_fold, average='micro'), 2)
    macro_f1_fold = round(f1_score(Y_test_fold, Y_pred_fold, average='macro'), 2)

    # Store metrics
    hamming_losses.append(hamming_loss_fold)
    micro_f1_scores.append(micro_f1_fold)
    macro_f1_scores.append(macro_f1_fold)

    print("-" * 30)
    print(f"KFold {fold_idx}:")
    print(f"\tHamming Loss: {hamming_loss_fold}")
    print(f"\tMicro-F1: {micro_f1_fold}")
    print(f"\tMacro-F1: {macro_f1_fold}")

print("-" * 30 + '\n')
print("-" * 90)
print(f"CV Mean Hamming Loss: {round(np.mean(hamming_losses), 2)}\tCV Mean Micro-F1: {round(np.mean(micro_f1_scores), 2)}\t\tCV Mean Macro-F1: {round(np.mean(macro_f1_scores), 2)}")
print("-" * 90)

------------------------------
KFold 1:
	Hamming Loss: 0.01
	Micro-F1: 0.76
	Macro-F1: 0.68
------------------------------
KFold 2:
	Hamming Loss: 0.01
	Micro-F1: 0.76
	Macro-F1: 0.68
------------------------------
KFold 3:
	Hamming Loss: 0.01
	Micro-F1: 0.76
	Macro-F1: 0.68
------------------------------
KFold 4:
	Hamming Loss: 0.01
	Micro-F1: 0.76
	Macro-F1: 0.69
------------------------------
KFold 5:
	Hamming Loss: 0.01
	Micro-F1: 0.76
	Macro-F1: 0.68
------------------------------

------------------------------------------------------------------------------------------
CV Mean Hamming Loss: 0.01	CV Mean Micro-F1: 0.76		CV Mean Macro-F1: 0.68
------------------------------------------------------------------------------------------


#### Classification Report for each Tag

In [14]:
mlb = MultiLabelBinarizer()
tag_matrix = mlb.fit_transform(df['Tag'].tolist())

> Use MultiLabelBinarizer to retrieve Tag labels.

In [15]:
print('*'*60)
for i in range(Y_train.shape[1]):
    print(mlb.classes_[i])
    print(classification_report(Y_test[:,i], Y_pred[:,i]),'\n'+'*'*60)

************************************************************
.net
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     14011
           1       0.29      0.04      0.08       138

    accuracy                           0.99     14149
   macro avg       0.64      0.52      0.54     14149
weighted avg       0.98      0.99      0.99     14149
 
************************************************************
activerecord
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14119
           1       0.62      0.27      0.37        30

    accuracy                           1.00     14149
   macro avg       0.81      0.63      0.69     14149
weighted avg       1.00      1.00      1.00     14149
 
************************************************************
ajax
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14028
           1       0.75      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99     13921
           1       0.78      0.54      0.64       228

    accuracy                           0.99     14149
   macro avg       0.89      0.77      0.82     14149
weighted avg       0.99      0.99      0.99     14149
 
************************************************************
ruby-on-rails
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13814
           1       0.95      0.80      0.87       335

    accuracy                           0.99     14149
   macro avg       0.97      0.90      0.93     14149
weighted avg       0.99      0.99      0.99     14149
 
************************************************************
ruby-on-rails-3
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14100
           1       0.31      0.08      0.13        49

    accuracy                

In [16]:
print("Total time to run the notebook:", int((time.time()-start_time)/60), "minutes")

Total time to run the notebook: 21 minutes
