First we load the dataset.

In [2]:
import pandas as pd
# Path of the dataset CSV that every later cell works from.
DATASET_PATH = "drive/MyDrive/final_dataset.csv"
f = pd.read_csv(DATASET_PATH)

Then we divide the dataset into test and train splits.

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
f1, test_f = train_test_split(
    f,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# NOTE(review): the preprocessing cells visible further down operate on `f`
# (the full frame) rather than on `f1` / `test_f` — confirm this is intended.

# Write both partitions to disk without the pandas index column.
for partition, csv_name in ((f1, "train.csv"), (test_f, "test.csv")):
    partition.to_csv(csv_name, index=False)

We first check the dataset for missing values and handle any that are found.
A common way to do so is to replace each missing value with the median of its column.


In [5]:
# Number of missing values in each column of the full frame.
missing_per_column = f.isna().sum()
print(missing_per_column)

index               0
role                0
committype          0
fileextensions      0
numfileschanged     0
linesadded          0
linesdeleted        0
numcommentsadded    0
timeofcommit        0
commitmessage       0
dtype: int64


# **Let's begin with the main preprocessing of the data:**
First let us convert the target labels and committype into integers.

In [6]:
from sklearn.preprocessing import LabelEncoder
# Separate encoder objects are kept for the target column (`le`) and for
# `committype` (`le1`), so each one holds its own fitted class mapping.
le, le1 = LabelEncoder(), LabelEncoder()

# Integer-encode the target labels.
f["role_encoded"] = le.fit_transform(f["role"])

# Integer-encode the commit type.
f["new_committype"] = le1.fit_transform(f["committype"])

Now let us count the number of distinct words in the commitmessage column to get a rough idea of what max_features value to use.

In [7]:
import re

def clean_text(text):
    """Lower-case a message, strip punctuation, and return its tokens.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        The whitespace-separated tokens of the cleaned message.
    """
    lowered = text.lower()
    # Remove every character that is neither a word character nor whitespace.
    unpunctuated = re.sub(r'[^\w\s]', '', lowered)
    # Collapse whitespace runs to single spaces and trim both ends.
    normalized = re.sub(r'\s+', ' ', unpunctuated).strip()
    return normalized.split()

# Tokenize every commit message, then flatten the per-message token lists.
tokens_per_message = f["commitmessage"].apply(clean_text)
words = []
for message_tokens in tokens_per_message:
    words.extend(message_tokens)

# Distinct tokens across all messages; the cell displays how many there are.
dwords = set(words)
len(dwords)

1344

In [8]:
import re

def clean_text(text):
    """Return the message lower-cased, without punctuation, whitespace-normalized.

    A function with this name is also defined in an earlier cell (it returns a
    token list); once this cell runs, this string-returning version is the one
    bound to ``clean_text``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned message as a single string.
    """
    # Lower-case, then drop everything that is not a word character or whitespace.
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    # Squeeze whitespace runs into single spaces and trim the ends.
    return re.sub(r'\s+', ' ', without_punctuation).strip()

# Store the cleaned commit messages in a new column and display it.
processed_messages = f["commitmessage"].apply(clean_text)
f["processedMessage"] = processed_messages
f["processedMessage"]

Unnamed: 0,processedMessage
0,implement responsive ui component with dropdow...
1,refactor ui components implement responsive th...
2,feat implement responsive ui layout with modal...
3,refactored ui components for responsive layout...
4,feat implement responsive ui layout for login ...
...,...
1495,implement responsive ui layout for login page ...
1496,implement responsive ui improvements for login...
1497,fixed authentication logic for api login endpo...
1498,implemented responsive modal dropdown componen...


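Next we use TfidfVectorizer to build a TF-IDF matrix from the processed commit messages; the vectorizer learns a vocabulary of terms from the messages, and each column of the matrix corresponds to one term.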

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Keep at most 800 terms, drop English stop words, and use unigrams plus bigrams.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=800,
)
# Learn the vocabulary from the cleaned messages and build the TF-IDF matrix.
tfidf_matrix = vectorizer.fit_transform(f['processedMessage'])

Then we clean the file extensions.

In [10]:
import re

def clean_fileextensions_safe(val):
    """Return every single-quoted substring found in ``str(val)``.

    Parameters
    ----------
    val : object
        Raw cell value; it is converted with ``str`` before matching, so
        non-string values do not raise.

    Returns
    -------
    list of str
        The text between each pair of single quotes, in order of appearance
        (an empty list when nothing is quoted).
    """
    as_text = str(val)
    # Non-greedy capture so that each quoted item is returned separately.
    return [match.group(1) for match in re.finditer(r"'(.*?)'", as_text)]

# Turn each raw fileextensions value into a list of the quoted names it contains
cleaned_extensions = f['fileextensions'].apply(clean_fileextensions_safe)
f['fileextensions_cleaned'] = cleaned_extensions
f['fileextensions_cleaned']


Unnamed: 0,fileextensions_cleaned
0,[js_ts]
1,[css]
2,[html]
3,[js_ts]
4,"[js_ts, html]"
...,...
1495,"[java_go, css]"
1496,[js_ts]
1497,[sql]
1498,[css]


Next we engineer the numeric features: we derive the net number of lines changed and then standardize the numeric columns.

In [None]:
# Net change in line count per commit: lines added minus lines deleted.
f['net_lines'] = f['linesadded'].sub(f['linesdeleted'])

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Columns fed to the scaler and the columns that receive the scaled values,
# listed in matching order.
numeric_inputs = ['net_lines', 'numfileschanged', 'numcommentsadded']
scaled_outputs = ['scaled_net_lines', 'scaled_files_changed', 'scaled_numcommentsadded']

# Standardize the numeric features; the scaler is fitted on the whole of `f`.
scaler = StandardScaler()
f[scaled_outputs] = scaler.fit_transform(f[numeric_inputs])

We convert the day and hour into machine-readable inputs.

In [None]:
# Split `timeofcommit` into a day part and a time part.
# Presumably the values look like "<day name> <HH:MM>" — TODO confirm against the data.
# NOTE(review): this two-column assignment needs the split to yield exactly two
# columns; a value containing more than one space would make it fail.
f[['day', 'hour']] = f['timeofcommit'].str.split(" ", expand = True)

# Keep only the text before the first colon as the hour. Deleting the literal
# ":00" (the previous approach) left times with non-zero minutes such as "14:30"
# unparseable, so they were silently replaced by the median hour below.
f['hour'] = f['hour'].str.split(':', n=1).str[0]
f['hour'] = pd.to_numeric(f['hour'], errors='coerce')

# Hours that still cannot be parsed fall back to the median hour.
f['hour'] = f['hour'].fillna(f['hour'].median()).astype(int)

# Day names are mapped to 1 (Monday) through 7 (Sunday); names missing from the
# mapping become NaN in `day_mapped`.
days= {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 7}
f['day_mapped'] = f['day'].map(days)


We encode the hour with a sinusoidal function.

In [None]:
import numpy as np
# Place the hour on a 24-hour cycle and take the sine of the resulting angle.
# NOTE(review): sine alone gives (nearly) the same value to some distinct hours,
# e.g. 0 and 12; a matching cosine feature would separate them — none is visible
# in this part of the notebook.
hour_angle = 2*np.pi*f['hour']/24
f['hour_sin'] = np.sin(hour_angle)