In [104]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np
from sklearn.preprocessing import (
    PowerTransformer,
    StandardScaler,
    OneHotEncoder,
    OrdinalEncoder,
)
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    AdaBoostClassifier,
    StackingClassifier,
    VotingClassifier,
)
from xgboost import XGBClassifier
from sklearn.svm import SVC, OneClassSVM
from sklearn.neural_network import MLPClassifier

# from hmmlearn import hmm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, roc_auc_score
from lightgbm import LGBMClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import pickle

warnings.filterwarnings("ignore")

In [105]:
voting_clf = VotingClassifier(
    estimators=[
        ("ab", RandomForestClassifier()),
        ("gb", GradientBoostingClassifier()),
        ("lgbm", LGBMClassifier(verbose=-1)),
    ],
    voting="hard",  # 'hard' for majority voting, 'soft' for weighted average probabilities
)
# RandomForestClassifier(class_weight='balanced', n_estimators=100)
# model = LGBMClassifier(verbose=-1)
# model = GradientBoostingClassifier()
# model = voting_clf
model = XGBClassifier()

In [106]:
import dask.dataframe as dd

# Load data
excel_file_path = "./train.csv"
df = dd.read_csv(excel_file_path)
df=df.compute()
# df = df.head(1000)

In [107]:
def remove_outliers(df, outlier_dict):
    for distribution, category in outlier_dict.items():
        if distribution == "normal":
            for cat in category:
                upper_limit = df[cat].mean() + 3 * df[cat].std()
                lower_limit = df[cat].mean() - 3 * df[cat].std()
                print(cat, upper_limit, lower_limit)
                # capping
                df[cat] = np.where(df[cat] > upper_limit,upper_limit,np.where(df[cat] < lower_limit, lower_limit, df[cat]))
                # Trimming
                # df = df[(df[cat] < upper_limit) & (df[cat] > lower_limit)]
        elif distribution == "skew":
            for cat in category:
                percentile25 = df[cat].quantile(0.25)
                percentile75 = df[cat].quantile(0.75)
                iqr = percentile75 - percentile25
                upper_limit = percentile75 + 1.5 * iqr
                lower_limit = percentile25 - 1.5 * iqr
                print(cat, upper_limit, lower_limit)
                # capping
                df[cat] = np.where(
                    df[cat] > upper_limit,
                    upper_limit,
                    np.where(df[cat] < lower_limit, lower_limit, df[cat]),
                )
                # Trimming
                # df = df[(df[cat] < upper_limit) & (df[cat] > lower_limit)]
    return df

In [108]:
# how to know no. of bins

from scipy import stats
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import KBinsDiscretizer


outlier_dict = {
    "normal": [
        'Vintage', 
    ],
    "skew": [
        'Annual_Premium', 
        'Age'
    ],
}

def veh_a(Vehicle_Damage):
  if Vehicle_Damage == 'Yes':
    return 1
  else:
    return 0

def pre_process(df):
    df['Vehicle_Damage'] = df['Vehicle_Damage'].apply(veh_a)
    df['Vehicle_Age'] = df['Vehicle_Age'].astype('category')
    df = pd.get_dummies(df, columns=['Vehicle_Age'], dtype=int)
    df['Gender'] = df['Gender'].astype('category')
    df = pd.get_dummies(df, columns=['Gender'],drop_first=True, dtype=int)
    # df['Age'] = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile').fit_transform(df[['Age']])
    df['Region_Code'] = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile').fit_transform(df[['Region_Code']])
    return df


df = pre_process(df)
df = remove_outliers(df, outlier_dict)
df.to_csv('df.csv', index=False)

Vintage 410.7842324155851 -83.6382324155851
Annual_Premium 61652.875 2505.875
Age 83.125 -9.875


In [109]:
from imblearn.over_sampling import SMOTE


# Define features and target
def get_X_Y(df):
    X = df.drop(
        columns=[
            "id",
            "Response",
            "Driving_License",
            "Vehicle_Age_> 2 Years"
        ]
    )
    Y = df["Response"]
    return X, Y

X, Y = get_X_Y(df)

In [110]:
# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=5
)

# for x in X_train.columns:
#     print(X_train[x].value_counts())
print(X_train.shape)
smote = SMOTE()
X_train, Y_train = smote.fit_resample(X_train, Y_train)

# Check columns
# X_train, X_test = X, X
# Y_train, Y_test = Y, Y
# print(Y_train.value_counts())


(800, 10)


In [111]:
# Calculate the correlation matrix
correlation_matrix = df.corr()

# Save the correlation matrix to a CSV file
correlation_matrix.to_csv('correlation_matrix.csv', index=True)

In [112]:
# Get the list of categorical column names
numerical_features = X_train.columns
categorical_feat_ord = [
    "Vehicle_Age", "Vehicle_Damage"
]
categorical_feat_nom = [
    "Gender",
]
cat = categorical_feat_ord + categorical_feat_nom
numerical_features = [item for item in numerical_features if item not in cat]
numerical_features = ['Annual_Premium', 'Vintage', 'Policy_Sales_Channel']

In [113]:
# Separate transformers for categorical and numerical features

from sklearn.pipeline import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

trf = PowerTransformer()
# trf = FunctionTransformer(np.sin)

numerical_transformer = Pipeline(
    steps=[
        # ("log", trf),
        ("scaler", MinMaxScaler()),  # StandardScaler MinMaxScaler
        # ('pca',PCA(n_components=1)) 
    ]
)
categorical_transformer_onehot = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
categorical_transformer_ordinal = Pipeline(
    steps=[
        ("ord", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ]
)

In [114]:
from sklearn.model_selection import GridSearchCV


preprocessor = ColumnTransformer(
    transformers=[
        # ("cat", categorical_transformer_onehot, categorical_feat_nom),
        # ("cat_1", categorical_transformer_ordinal, categorical_feat_ord),
        ("num", numerical_transformer, numerical_features),
    ]
)
# Define the pipeline
pipeline = Pipeline([("preprocessor", preprocessor), ("feature_selection", SelectKBest(score_func=chi2, k='all')), ("model", model)])

# Fit the pipeline on the training data
pipeline.fit(X_train, Y_train)

ValueError: Found array with 0 feature(s) (shape=(1422, 0)) while a minimum of 1 is required by SelectKBest.

In [None]:
# Save the fitted pipeline as a .pkl file
filename_pkl = "model.pkl"
pickle.dump(pipeline, open(filename_pkl, "wb"))
print(f"Model saved as {filename_pkl}")

Model saved as model.pkl


In [None]:
# Evaluate the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6437578228217787


In [None]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.63      0.76   2018336
           1       0.22      0.77      0.35    282624

    accuracy                           0.64   2300960
   macro avg       0.59      0.70      0.55   2300960
weighted avg       0.86      0.64      0.70   2300960



In [None]:
cross_val_score(pipeline, X_test, Y_test, cv=3, scoring="accuracy").mean()

0.8771495375920102

In [None]:
import pandas as pd
import numpy as np
import pickle

# Load the trained model
loaded_model = pickle.load(open("model.pkl", "rb"))

# Define the columns expected by the model
column_names = X_train.columns


def generate_submission(test_file):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(test_file)
    df = pd.DataFrame(df)
    # Replace empty strings with NaN
    df.replace("", np.nan, inplace=True)
    df = pre_process(df)
    # Select the relevant columns
    filtered_df = df[column_names]
    predictions = pipeline.predict(filtered_df)
    # Load the original test file to keep the PassengerId column
    original_df = pd.read_csv(test_file)
    original_df["Target"] = predictions
    # Save the results to a new CSV file
    submission_df = original_df[["id", "Target"]]
    submission_df.to_csv("submission.csv", index=False)
    print("Submission file saved as 'submission.csv'")


# Generate the submission
test_file = "test.csv"
generate_submission(test_file)

Submission file saved as 'submission.csv'


In [None]:
import pandas as pd
from pandas_profiling import ProfileReport


def gen_eda():
    X_train = df.drop(columns=['Response'])
    Y_train = df['Response']
    profile = ProfileReport(
        pd.concat([X_train, Y_train], axis=1),
        title="Pandas Profiling Report",
        explorative=True,
    )
    profile.to_file("pandas_profiling_report.html")


# gen_eda()