# Model Building

In [2]:

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Location of the raw task export, resolved relative to the working directory.
DATA_PATH = 'task data.csv'

# Load the raw tasks and list the available columns.
df = pd.read_csv(DATA_PATH)
df.columns

Index(['task_id', 'title', 'description', 'priority', 'status', 'assigned_to',
       'deadline', 'created_at', 'category'],
      dtype='object')

In [4]:
# Capture the per-column missing-value counts *before* dropping rows. A bare
# expression that is not the last line of a cell is evaluated and discarded, so
# the counts are stored here and echoed as the cell's final expression instead.
missing_before = df.isnull().sum()

# Remove every row that has at least one missing value and renumber the index.
df = df.dropna().reset_index(drop=True)

missing_before

In [5]:
# Re-count missing values per column after the drop; every count should be zero.
df.isna().sum()

task_id        0
title          0
description    0
priority       0
status         0
assigned_to    0
deadline       0
created_at     0
category       0
dtype: int64

In [6]:
# Count rows that exactly repeat an earlier row (the first occurrence is not flagged).
df.duplicated(keep='first').sum()

0

In [7]:
# Summary statistics with default settings; in the recorded output only task_id is summarised.
df.describe()

Unnamed: 0,task_id
count,1476.0
mean,751.466802
std,432.185333
min,1.0
25%,377.75
50%,752.5
75%,1124.25
max,1500.0


In [8]:
# Summary of the object-dtype (text) columns: count, number of unique values, most frequent value and its frequency.
df.describe(include='object')

Unnamed: 0,title,description,priority,status,assigned_to,deadline,created_at,category
count,1476,1476,1476,1476,1476,1476,1476,1476
unique,13,15,7,5,8,87,60,10
top,Fix Login Bug,Update the navigation bar as per the new UI gu...,Medium,Open,Diana,2025-05-30,2025-04-21,Refactor
freq,181,165,498,533,305,39,34,314


In [9]:
# Preview the first five rows to eyeball the column contents.
df.head(n=5)

Unnamed: 0,task_id,title,description,priority,status,assigned_to,deadline,created_at,category
0,1,Update Database Schema,Write unit tests for the newly added features.,Medium,Completed,Charlie,2025-03-25,2025-03-17,Feature
1,2,Create Unit Tests,Write unit tests for the newly added features.,High,Open,Eve,2025-05-26,2025-05-06,Documentation
2,3,Fix UI Glitches,Update the navigation bar as per the new UI gu...,Low,Completed,Alice,2025-04-23,2025-04-10,Improvement
3,4,Create Unit Tests,Fix alignment and spacing issues on the mobile...,Medium,In Progress,Eve,2025-05-05,2025-04-24,Feature
4,5,Update Database Schema,Improve the performance of the backend queries.,Medium,Completed,Eve,2025-06-10,2025-04-27,Documentation


In [10]:
# Frequency of each distinct 'priority' value; used to spot labels other than High/Medium/Low.
df['priority'].value_counts()

priority
Medium          498
High            495
Low             479
measure           1
relationship      1
design            1
oil               1
Name: count, dtype: int64

In [11]:
# Restrict the data to the three recognised priority levels; any other value is
# treated as noise and its row is discarded.
allowed_priorities = ['High', 'Medium', 'Low']
has_valid_priority = df['priority'].isin(allowed_priorities)
df = df.loc[has_valid_priority].reset_index(drop=True)

# Show the distribution that remains after filtering.
df['priority'].value_counts()

priority
Medium    498
High      495
Low       479
Name: count, dtype: int64

In [12]:
# Frequency of each distinct 'status' value; used to spot labels other than Open/In Progress/Completed.
df['status'].value_counts()


status
Open           532
In Progress    510
Completed      428
meet             1
finish           1
Name: count, dtype: int64

In [13]:
# Keep only rows whose 'status' is one of the three recognised workflow states.
allowed_statuses = ['Completed', 'In Progress', 'Open']
status_is_known = df['status'].isin(allowed_statuses)
df = df[status_is_known].reset_index(drop=True)

# Show the distribution that remains after filtering.
df['status'].value_counts()

status
Open           532
In Progress    510
Completed      428
Name: count, dtype: int64

In [14]:
# Frequency of each distinct 'category' value (the prediction target); used to spot unexpected labels.
df['category'].value_counts()

category
Refactor         313
Bug              304
Feature          286
Documentation    285
Improvement      279
resource           1
itself             1
former             1
Name: count, dtype: int64

In [15]:
# Keep only rows whose 'category' is one of the five expected labels; any other
# value is treated as noise and its row is discarded.
allowed_categories = ['Refactor', 'Bug', 'Feature', 'Documentation', 'Improvement']
df = df[df['category'].isin(allowed_categories)].reset_index(drop=True)

# Show the distribution that remains after filtering.
df['category'].value_counts()

category
Refactor         313
Bug              304
Feature          286
Documentation    285
Improvement      279
Name: count, dtype: int64

In [16]:
# Display the filtered frame; the recorded output is a truncated preview of the first and last rows.
df

Unnamed: 0,task_id,title,description,priority,status,assigned_to,deadline,created_at,category
0,1,Update Database Schema,Write unit tests for the newly added features.,Medium,Completed,Charlie,2025-03-25,2025-03-17,Feature
1,2,Create Unit Tests,Write unit tests for the newly added features.,High,Open,Eve,2025-05-26,2025-05-06,Documentation
2,3,Fix UI Glitches,Update the navigation bar as per the new UI gu...,Low,Completed,Alice,2025-04-23,2025-04-10,Improvement
3,4,Create Unit Tests,Fix alignment and spacing issues on the mobile...,Medium,In Progress,Eve,2025-05-05,2025-04-24,Feature
4,5,Update Database Schema,Improve the performance of the backend queries.,Medium,Completed,Eve,2025-06-10,2025-04-27,Documentation
...,...,...,...,...,...,...,...,...,...
1462,1496,Refactor Codebase,Update the schema to support new user roles an...,Medium,Open,Charlie,2025-06-05,2025-04-20,Bug
1463,1497,Implement OAuth2,Implement secure login using OAuth2 protocol.,High,Open,Charlie,2025-05-04,2025-04-19,Improvement
1464,1498,Write API Docs,Update the navigation bar as per the new UI gu...,Medium,In Progress,Diana,2025-06-11,2025-04-29,Improvement
1465,1499,Optimize Backend,Complete the documentation for the public API.,Low,Completed,Diana,2025-06-09,2025-04-22,Improvement


In [17]:
# Remove the row identifier and the assignee columns in one step.
# Reassigning (rather than inplace=True) together with errors='ignore' makes
# the cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
df = df.drop(columns=['task_id', 'assigned_to'], errors='ignore')

In [18]:
# Display the frame again to confirm that task_id and assigned_to are no longer present.
df

Unnamed: 0,title,description,priority,status,deadline,created_at,category
0,Update Database Schema,Write unit tests for the newly added features.,Medium,Completed,2025-03-25,2025-03-17,Feature
1,Create Unit Tests,Write unit tests for the newly added features.,High,Open,2025-05-26,2025-05-06,Documentation
2,Fix UI Glitches,Update the navigation bar as per the new UI gu...,Low,Completed,2025-04-23,2025-04-10,Improvement
3,Create Unit Tests,Fix alignment and spacing issues on the mobile...,Medium,In Progress,2025-05-05,2025-04-24,Feature
4,Update Database Schema,Improve the performance of the backend queries.,Medium,Completed,2025-06-10,2025-04-27,Documentation
...,...,...,...,...,...,...,...
1462,Refactor Codebase,Update the schema to support new user roles an...,Medium,Open,2025-06-05,2025-04-20,Bug
1463,Implement OAuth2,Implement secure login using OAuth2 protocol.,High,Open,2025-05-04,2025-04-19,Improvement
1464,Write API Docs,Update the navigation bar as per the new UI gu...,Medium,In Progress,2025-06-11,2025-04-29,Improvement
1465,Optimize Backend,Complete the documentation for the public API.,Low,Completed,2025-06-09,2025-04-22,Improvement


In [19]:
# Convert both date columns from text to datetime values.
for date_col in ('created_at', 'deadline'):
    df[date_col] = pd.to_datetime(df[date_col])

# Whole days between creation and deadline (deadline minus created_at).
time_to_deadline = df['deadline'] - df['created_at']
df['task_completion_days'] = time_to_deadline.dt.days

In [20]:
# Display the frame to inspect the newly added task_completion_days column.
df

Unnamed: 0,title,description,priority,status,deadline,created_at,category,task_completion_days
0,Update Database Schema,Write unit tests for the newly added features.,Medium,Completed,2025-03-25,2025-03-17,Feature,8
1,Create Unit Tests,Write unit tests for the newly added features.,High,Open,2025-05-26,2025-05-06,Documentation,20
2,Fix UI Glitches,Update the navigation bar as per the new UI gu...,Low,Completed,2025-04-23,2025-04-10,Improvement,13
3,Create Unit Tests,Fix alignment and spacing issues on the mobile...,Medium,In Progress,2025-05-05,2025-04-24,Feature,11
4,Update Database Schema,Improve the performance of the backend queries.,Medium,Completed,2025-06-10,2025-04-27,Documentation,44
...,...,...,...,...,...,...,...,...
1462,Refactor Codebase,Update the schema to support new user roles an...,Medium,Open,2025-06-05,2025-04-20,Bug,46
1463,Implement OAuth2,Implement secure login using OAuth2 protocol.,High,Open,2025-05-04,2025-04-19,Improvement,15
1464,Write API Docs,Update the navigation bar as per the new UI gu...,Medium,In Progress,2025-06-11,2025-04-29,Improvement,43
1465,Optimize Backend,Complete the documentation for the public API.,Low,Completed,2025-06-09,2025-04-22,Improvement,48


In [21]:
# Build the model's input text: title and description joined by a single space.
df['text'] = df['title'].str.cat(df['description'], sep=' ')

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer # type: ignore

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])

In [23]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['category'])

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split, allowing up to
# 1000 solver iterations. fit() returns the estimator itself, so construction
# and fitting are chained.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Echo the fitted estimator as the cell output.
clf

In [26]:
from sklearn.metrics import classification_report

# Predict the held-out split and summarise precision/recall/F1 per category.
# le.classes_ supplies the category names for the encoded labels.
y_pred = clf.predict(X_test)
report_text = classification_report(y_test, y_pred, target_names=le.classes_)
print(report_text)

               precision    recall  f1-score   support

          Bug       0.21      0.20      0.21        65
Documentation       0.20      0.04      0.06        56
      Feature       0.22      0.22      0.22        63
  Improvement       0.13      0.12      0.13        49
     Refactor       0.21      0.39      0.27        61

     accuracy                           0.20       294
    macro avg       0.20      0.19      0.18       294
 weighted avg       0.20      0.20      0.18       294



In [27]:
# Smoke-test the pipeline on one hand-written task description.
custom_task = ["Implement a new feature for the application."]

# Vectorize with the fitted TF-IDF, classify, and map the numeric label back
# to its category name in a single chained expression.
predicted_category = le.inverse_transform(
    clf.predict(vectorizer.transform(custom_task))
)

print("Predicted Category:", predicted_category[0])

Predicted Category: Bug


In [28]:
import joblib

# Persist the fitted text vectorizer and label encoder.
for fitted_obj, out_path in [
    (vectorizer, 'vectorizer.joblib'),
    (le, 'label_encoder.joblib'),
]:
    joblib.dump(fitted_obj, out_path)

# Persist the classifier last, as the cell's final expression, so the list of
# written file names returned by dump() is shown as the cell output.
joblib.dump(clf, 'task_classifier.joblib')

['task_classifier.joblib']