In [None]:
# Import necessary libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
warnings.filterwarnings("ignore")

In [None]:
# 1. Load and inspect the data
# Location of the training sample on disk; the raw string keeps the
# Windows backslashes literal.
file_path = r"C:\Users\Vettri\Downloads\CS\Data\new_train_sample.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Remove the Timestamp column when it exists; a frame without it passes
# through unchanged because missing labels are ignored.
df = df.drop(labels=["Timestamp"], axis="columns", errors="ignore")

In [None]:
# Understanding data: shape, a preview of the first rows, and the
# per-column dtype / non-null summary.
print("Dataset Shape:", df.shape)
print("\nFirst few rows of the dataset:")
print(df.head())
print("\nData Summary:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line after the summary.
df.info()

In [None]:
# Target Variable Distribution
# Name of the label column; later cells read this variable.
target_column = 'IncidentGrade'

# Row count per class, as text.
print("\nTarget Distribution:")
class_counts = df[target_column].value_counts()
print(class_counts)

# The same distribution as a bar chart.
ax = sns.countplot(data=df, x=target_column)
ax.set_title('Target Variable Distribution')
plt.show()

In [None]:
# 2. Exploratory Data Analysis (EDA)
# Percentage of missing values in every column.
missing_percentage = df.isnull().sum() / len(df) * 100
print("\nMissing Values Percentage per Column:")
print(missing_percentage)

# Bucket the columns by how much of them is missing:
#   > 50%  -> dropped
#   < 5%   -> kept untouched (any NaNs they contain are NOT filled here)
#   rest   -> kept and imputed below
drop_columns = missing_percentage[missing_percentage > 50].index
drop_columns_less_than_5 = missing_percentage[missing_percentage < 5].index

# Drop columns with more than 50% missing values.
df = df.drop(columns=drop_columns)

# Impute the remaining columns (those not in the "< 5% missing" bucket):
# most frequent value for object columns, median for everything else.
columns_to_impute = df.columns.difference(drop_columns_less_than_5)
for col in columns_to_impute:
    if df[col].dtype == 'object':  # Categorical
        fill_value = df[col].mode()[0]
    else:  # Numeric
        fill_value = df[col].median()
    # Assign the filled column back. `df[col].fillna(..., inplace=True)` acts on
    # a temporary Series: pandas 2.2 warns about it (hidden here because
    # warnings are silenced) and with Copy-on-Write it never updates `df`.
    df[col] = df[col].fillna(fill_value)

# Summary of missing data handling
print(f"\nColumns dropped (more than 50% missing): {drop_columns}")
print(f"\nColumns kept (with missing data < 50% and > 5%): {columns_to_impute}")

# Summary Statistics
print("\nSummary Statistics:")
print(df.describe())

In [None]:
# 3. Data Preprocessing
# (a) Encoding Categorical Variables
# A missing label is not a class: remove unlabeled rows before encoding so
# NaN cannot end up as an extra target class (or trip the encoder).
# No-op when the target column has no missing values.
df = df.dropna(subset=[target_column])

# Encode the target; `le_target` is kept so the integer class ids can be
# mapped back to (or re-applied from) the original label strings.
le_target = LabelEncoder()
df[target_column] = le_target.fit_transform(df[target_column])

# Encode every remaining object column with its own LabelEncoder.
for col in df.select_dtypes(include=['object']).columns:
    if col != target_column:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# (b) Feature Scaling (optional for certain models)
scaler = StandardScaler()

# Scale numeric FEATURE columns only. The encoded target is an integer column;
# whenever its dtype is int64 the previous selection swept it into the scaler
# and turned the class ids into non-integer floats, which classifiers and
# SMOTE reject as a continuous target.
num_cols = [
    col
    for col in df.select_dtypes(include=['int64', 'float64']).columns
    if col != target_column
]

# NOTE(review): the scaler is fit on the full frame, before the train/test
# split further down - confirm whether that is acceptable for this analysis.
df[num_cols] = scaler.fit_transform(df[num_cols])

In [None]:
# 4. Data Splitting
# Label vector, and the feature matrix made of every other column.
y = df[target_column]
X = df.drop(columns=[target_column])

# Stratified 70/30 train-test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# 5. Handle Class Imbalance using SMOTE
# Oversample the training split only; the held-out split is left as it is.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Class counts after resampling.
resampled_counts = pd.Series(y_train_resampled).value_counts()
print("\nTarget Distribution After SMOTE:")
print(resampled_counts)

In [None]:
# 6. Model Selection and Training
# Candidate classifiers, keyed by the display name used in the reports.
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
}

# Macro-averaged metrics printed after the per-class report, in this order.
macro_metrics = (
    ("Macro F1 Score:", f1_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
)

# Fit each model on the resampled training data and score it on the
# held-out split.
for name, model in models.items():
    print(f"\nTraining {name}...")
    y_pred = model.fit(X_train_resampled, y_train_resampled).predict(X_test)

    # Evaluation Metrics
    print(f"\n{name} Performance Metrics:")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    for label, metric in macro_metrics:
        print(label, metric(y_test, y_pred, average='macro'))


In [None]:
# 7. Feature Importance (for Random Forest)
# Uses the forest that was fitted in the training loop.
rf_model = models['Random Forest']

# Pair each feature name with its importance, most important first.
importance_rows = list(zip(X.columns, rf_model.feature_importances_))
feature_importances = pd.DataFrame(importance_rows, columns=['Feature', 'Importance'])
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

print("\nRandom Forest Feature Importances:")
print(feature_importances)

In [None]:
# 8. Cross-Validation
# 5-fold stratified CV, macro-F1, on the training split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for name, model in models.items():
    # Oversample INSIDE each fold. Cross-validating on data that was already
    # SMOTE-resampled puts synthetic points, interpolated from training rows,
    # into the validation folds and inflates the scores. Wrapping SMOTE and
    # the model in an imblearn Pipeline resamples only the fitting part of
    # every fold; validation folds keep the original class balance.
    # cross_val_score clones the pipeline, so the fitted estimators stored in
    # `models` are left untouched.
    cv_pipeline = ImbPipeline(steps=[
        ("smote", SMOTE(random_state=42)),
        ("model", model),
    ])
    scores = cross_val_score(cv_pipeline, X_train, y_train, cv=cv, scoring='f1_macro')
    print(f"\nCross-Validation Macro F1 Scores for {name}:")
    print(scores)
    print("Mean F1 Score:", scores.mean())

# **Test the models on the test dataset**
# Load the test dataset
test_file_path = r"C:\Users\Vettri\Downloads\CS\Data\GUIDE_Test.csv"  # Test file path
test_df = pd.read_csv(test_file_path)

# Preprocess test dataset (similar to the train dataset)
# Drop the Timestamp column if present
test_df = test_df.drop(columns=['Timestamp'], errors='ignore')

# Rows without a label cannot be scored, so they are removed up front.
test_df = test_df.dropna(subset=[target_column])

# Keep exactly the columns the models were trained on (plus the target), in
# training order. Deciding which columns to drop from the test file's own
# missing-value percentages can produce a different feature set than the one
# the models were fit on; a column missing from the test file fails loudly here.
test_df = test_df[[*X.columns, target_column]].copy()

# Missing-value profile of the test data.
missing_percentage_test = test_df.isnull().sum() / len(test_df) * 100
# Informational only: columns that are mostly empty in the test file. They are
# not dropped, because the trained models still expect them.
drop_columns_test = missing_percentage_test[missing_percentage_test > 50].index
drop_columns_less_than_5_test = missing_percentage_test[missing_percentage_test < 5].index

# Keep columns with less than 5% missing values and impute the remaining:
# most frequent value for object columns, median for everything else.
columns_to_impute_t = test_df.columns.difference(drop_columns_less_than_5_test)
for col in columns_to_impute_t:
    if test_df[col].dtype == 'object':  # Categorical
        fill_value = test_df[col].mode()[0]
    else:  # Numeric
        fill_value = test_df[col].median()
    # Assign back instead of a chained `fillna(inplace=True)`, which acts on a
    # temporary Series and does not update the frame under Copy-on-Write.
    test_df[col] = test_df[col].fillna(fill_value)

# Encode the target with the encoder fitted on the training labels so class
# ids mean the same thing as during training. A label unseen in training
# raises here rather than being silently mapped to some other class id.
test_df[target_column] = le_target.transform(test_df[target_column])

# Encode the categorical feature columns.
# NOTE(review): a fresh LabelEncoder is fit on the test values here, so the
# integer codes only match training when both files contain exactly the same
# set of categories in a column. Reusing the encoders fitted on the training
# data would make this robust.
for col in test_df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    test_df[col] = le.fit_transform(test_df[col])

# Feature scaling for test data, using the scaler fitted on the training data.
test_df[num_cols] = scaler.transform(test_df[num_cols])

# Prepare the test data for prediction
X_test_final = test_df[X.columns]
y_test_final = test_df[target_column]

# Evaluate each (already fitted) model on the test set
for name, model in models.items():
    print(f"\nTesting {name} on the test dataset...")
    y_pred_test = model.predict(X_test_final)

    # Evaluation Metrics on Test Set
    print(f"\n{name} Test Performance Metrics:")
    print(confusion_matrix(y_test_final, y_pred_test))
    print(classification_report(y_test_final, y_pred_test))
    print("Macro F1 Score:", f1_score(y_test_final, y_pred_test, average='macro'))
    print("Precision:", precision_score(y_test_final, y_pred_test, average='macro'))
    print("Recall:", recall_score(y_test_final, y_pred_test, average='macro'))
