In [9]:
# Importing the necessary tools
import pandas as pd
import numpy as np  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from xgboost import XGBClassifier
import scipy.sparse
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [10]:
df=pd.read_json('datafinal.json')
df.head(n=30)

Unnamed: 0,ID,Text,ContainsCode,CodeList
0,1,Test-driven development (TDD) is a software de...,,
1,2,Software development is an exciting field that...,False,
2,3,The development process often starts with a co...,,
3,4,Another important aspect of software developme...,True,git commit -m 'Initial commit'
4,5,"In the world of software development, language...",False,
5,6,There are many tools and frameworks available ...,,
6,7,"In the world of software development, the most...",,
7,8,Software development is an intricate process t...,False,
8,9,"In this modern era, software development has i...",True,public class HelloWorld { public static void m...
9,10,"In the realm of software development, one ofte...",False,


In [11]:
df['ContainsCode']=df['ContainsCode'].astype(bool)

In [29]:
df.nunique()

ID              1908
Text            1907
ContainsCode       2
CodeList         434
dtype: int64

In [12]:
df.dtypes

ID               int64
Text            object
ContainsCode      bool
CodeList        object
dtype: object

In [13]:

mlb = MultiLabelBinarizer()
s1 = df["CodeList"]
t1 = mlb.fit_transform(s1)
t1

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [14]:

df['Text'] = df['Text'].str.replace('[^\w\s]', '').str.lower()

# Split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(df[['Text', 'ContainsCode']], t1, test_size=0.3, random_state=42)

# TF-IDF vectorization for the 'Text' column
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')  
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['Text'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['Text'])

# Combine TF-IDF features with 'ContainsCode' column
X_train_combined = scipy.sparse.hstack([X_train_tfidf, X_train['ContainsCode'].values.reshape(-1, 1)])
X_test_combined = scipy.sparse.hstack([X_test_tfidf, X_test['ContainsCode'].values.reshape(-1, 1)])

# Define the hyperparameter grid for XGBClassifier
param_grid = {
    'estimator__n_estimators': np.arange(50, 200, 10),  # Adjust the range as needed
    'estimator__max_depth': np.arange(3, 10),  # Adjust the range as needed
    'estimator__learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],  # Adjust the values as needed
}

# Create an instance of MultiOutputClassifier with XGBClassifier
base_estimator = XGBClassifier()

# Create an instance of RandomizedSearchCV
random_search = RandomizedSearchCV(
    MultiOutputClassifier(base_estimator),
    param_distributions=param_grid,
    scoring='accuracy',
    cv=5,
    n_iter=10,  # Number of random parameter combinations to try
    verbose=1,  # Increase verbosity for progress updates
    n_jobs=-1,  # Use all available CPU cores
)

# Fit the RandomizedSearchCV object to your training data
random_search.fit(X_train_combined, y_train)

# Retrieve the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best estimator for prediction
best_estimator = random_search.best_estimator_
y_pred = best_estimator.predict(X_test_combined)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


  df['Text'] = df['Text'].str.replace('[^\w\s]', '').str.lower()


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'estimator__n_estimators': 50, 'estimator__max_depth': 4, 'estimator__learning_rate': 0.1}
Accuracy: 74.00%


This code block is a machine learning pipeline that uses the XGBoost algorithm to classify text data. The first few lines of code preprocess the text data by removing punctuation and converting all text to lowercase. The dataset is then split into training and testing sets. The text data is vectorized using the TF-IDF method, which converts the text into numerical features that can be used for machine learning. The `ContainsCode` column is also included as a feature. The resulting feature matrices are combined using `scipy.sparse.hstack()`. 

The hyperparameters for the XGBoost algorithm are tuned using `RandomizedSearchCV()`, which performs a randomized search over a range of hyperparameters to find the best combination. The best hyperparameters are then used to train the XGBoost model on the training data. The model is then used to predict the labels of the test data, and the accuracy of the model is evaluated using `accuracy_score()`.

In [25]:
# Calculate F1 score, precision, and recall for multi-label classification
f1 = f1_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print(f'Weighted F1 Score: {f1 * 100:.2f}%')
print(f'Weighted Precision: {precision * 100:.2f}%')
print(f'Weighted Recall: {recall * 100:.2f}%')

Weighted F1 Score: 76.96%
Weighted Precision: 80.36%
Weighted Recall: 77.17%


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


This code block calculates the F1 score, precision, and recall for the XGBoost model that was trained in the previous code block. These metrics are commonly used to evaluate the performance of a classification model. 

The `average='weighted'` parameter in the `f1_score()`, `precision_score()`, and `recall_score()` functions specifies that the weighted average of the F1 score, precision, and recall should be calculated. The weighted average takes into account the number of samples in each class, which is important when dealing with imbalanced datasets. 

The F1 score is the harmonic mean of precision and recall, and it provides a balance between the two metrics. Precision is the ratio of true positives to the total number of predicted positives, and it measures the model's ability to correctly identify positive samples. Recall is the ratio of true positives to the total number of actual positives, and it measures the model's ability to correctly identify all positive samples. 

We are checking the F1 score, precision, and recall of the model to evaluate its performance on the test data. These metrics provide a quantitative measure of how well the model is able to classify the test data. A high F1 score, precision, and recall indicate that the model is performing well, while a low score indicates that the model may need further tuning.

In [15]:
y_pred
y_prediction =pd.DataFrame(y_pred)

In [16]:
y_prediction

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,83,84,85,86,87,88,89,90,91,92
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
569,1,1,0,0,0,0,0,1,1,1,...,1,1,1,0,0,1,0,1,0,1
570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
571,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
y_prediction.to_csv('prediction.csv', index=False)
