In [1]:
# Importing relevant packages

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [2]:
# Reading and spltting the data into test and train

df = pd.read_csv("train_data.csv")

# Defining the X and y 
X = df.drop(['id','defects'], axis =1)
y = df['defects']

# Splitting the data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 42)

In [3]:
# Function to handle Infinite values

class HandleInf(BaseEstimator, TransformerMixin):
    def __init__(self, value=np.nan):
        self.value = value

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = pd.DataFrame(X).replace([np.inf, -np.inf], self.value)
        return X

In [4]:
# Categorizing the columns in X

numerical_features = X_train.select_dtypes(include=['int64','float64']).columns
categorical_features = X_train.select_dtypes(include=['object','bool']).columns

In [5]:
# Preprocessing pipleine

numerical_transformer = Pipeline(steps=[
    ('handle_inf', HandleInf()),
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('to_string', FunctionTransformer(lambda x: x.astype(str), validate=False)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

### Random Forest Classifier

In [10]:
# Classification model and metrics

rf_classifier = RandomForestClassifier(n_estimators=100, random_state = 42)

rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier)
])

# # Splitting into test and train data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Model training
rf_pipeline.fit(X_train, y_train)

#Predicting the output for test data
y_rf_pred = rf_pipeline.predict(X_test)

#Evaluating the model
from sklearn.metrics import accuracy_score, classification_report

rf_accuracy = accuracy_score(y_test,y_rf_pred)
print('Accuracy score from Random Forest Classifier is ', rf_accuracy)

Accuracy score from Random Forest Classifier is  0.8211229946524065


### Logistic Regression Classifier

In [11]:
from sklearn.linear_model import LogisticRegression

log_classifier = LogisticRegression(max_iter = 10000)

log_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier)
])

# # Splitting into test and train data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Model training
log_pipeline.fit(X_train, y_train)

#Predicting the output for test data
y_log_pred = log_pipeline.predict(X_test)

#Evaluating the model
from sklearn.metrics import accuracy_score, classification_report

log_accuracy = accuracy_score(y_test,y_log_pred)
print('Accuracy score from Logistic Regression is ', log_accuracy)

Accuracy score from Logistic Regression is  0.8211229946524065


### Predicting the actual test data

In [18]:
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

X_train = train_df.drop(['id','defects'], axis =1)
y_train = train_df['defects']

X_test = test_df.drop(['id'], axis =1)

id_df = test_df['id']

# Categorizing the train data
num_features = X_train.select_dtypes(include=['int64','float64']).columns
cat_features = X_train.select_dtypes(include=['object','bool']).columns

# Defining the preprocessor
act_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ]
)

# Defining the model pipeline
rf_classifier = RandomForestClassifier(n_estimators=100, random_state = 42)

rf_pipeline = Pipeline(steps=[
    ('act_preprocessor', preprocessor),
    ('classifier', rf_classifier)
])

rf_pipeline.fit(X_train, y_train)

# Predicting the target values for test data

predicted = rf_pipeline.predict(X_test)

result_df = pd.DataFrame({'id': id_df, 'defects': predicted.astype(int)})

result_df.to_csv('super_hack_prediction_v1.csv', index = False)

In [14]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32056 entries, 0 to 32055
Data columns (total 36 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id                               32056 non-null  int64  
 1   McCabeLineCount                  32056 non-null  int64  
 2   McCabeCyclomaticComplexity       32056 non-null  int64  
 3   McCabeEssentialComplexity        32056 non-null  int64  
 4   McCabeDesignComplexity           25125 non-null  float64
 5   HalsteadTotalOperatorsOperands   32056 non-null  int64  
 6   HalsteadVolume                   26140 non-null  float64
 7   HalsteadProgramLength            32056 non-null  float64
 8   HalsteadDifficulty               28835 non-null  object 
 9   HalsteadIntelligence             29743 non-null  float64
 10  HalsteadEffort                   32056 non-null  float64
 11  HalsteadB                        32056 non-null  float64
 12  HalsteadTimeEstima