## Importing libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

## Loading Data

In [2]:
# Locations of the two CSV inputs.
# NOTE(review): 'Out_of_time_data.csv' is used for training and
# 'claim_data_v1.csv' for evaluation -- confirm this assignment is intended.
train_file_path = 'Out_of_time_data.csv'
test_file_path = 'claim_data_v1.csv'

# Read both files into DataFrames (training file first, then the test file).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

## Data Cleaning

In [3]:
def clean_data(data):
    """Return a cleaned copy of ``data`` without modifying the input frame.

    Column names are stripped of surrounding whitespace, inner spaces are
    replaced by underscores, and everything is lower-cased. Rows containing
    any missing value are removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame with string column names.

    Returns
    -------
    pandas.DataFrame
        New frame with standardized column names and no rows with NaN.
    """
    # dropna() returns a new DataFrame, so renaming its columns below leaves
    # the caller's frame (and its original headers) untouched.
    cleaned = data.dropna()
    cleaned.columns = (
        cleaned.columns.str.strip()
        .str.replace(" ", "_", regex=False)  # literal space, not a regex
        .str.lower()
    )
    return cleaned

# Standardize headers and discard incomplete rows in both splits.
train_data, test_data = clean_data(train_data), clean_data(test_data)

# Remove these columns from both splits before modelling; errors='ignore'
# skips any name that is not present in a frame.
columns_to_drop = ['inspected_or_not', 'random']
train_data = train_data.drop(errors='ignore', columns=columns_to_drop)
test_data = test_data.drop(errors='ignore', columns=columns_to_drop)

## Encoding Categorical Columns

In [4]:
# Integer-encode the categorical columns.
#
# Each encoder is fitted on the TRAINING values only, plus an explicit
# "Unknown" sentinel, so that:
#   * no information from the test split leaks into preprocessing, and
#   * categories that appear only in the test split have a valid code.
# The sentinel is part of the fitted vocabulary, so ``classes_`` is never
# edited after fitting and stays in the sorted order LabelEncoder produces.
categorical_cols = ['person_gender', 'person_education', 'plan_type', 'previous_fraud']
label_encoders = {}

for col in categorical_cols:
    # Cast to str so the sentinel can share one vocabulary with the column
    # values even if the column was parsed as numeric.
    train_values = train_data[col].astype(str)
    test_values = test_data[col].astype(str)

    le = LabelEncoder()
    le.fit(pd.concat([train_values, pd.Series(["Unknown"])], ignore_index=True))

    # Categories never seen during training are mapped to the sentinel.
    test_values = test_values.where(test_values.isin(le.classes_), "Unknown")

    train_data[col] = le.transform(train_values)
    test_data[col] = le.transform(test_values)
    label_encoders[col] = le  # kept so the mapping can be inspected or reused

## Splitting the Data

In [5]:
# 'fraud_status' is the label; every remaining column is a feature.
target_column = 'fraud_status'

# Training split: feature matrix and target vector.
X_train, y_train = train_data.drop(columns=[target_column]), train_data[target_column]

# Testing split: feature matrix and target vector.
X_test, y_test = test_data.drop(columns=[target_column]), test_data[target_column]

## Scaling the features

In [6]:
# Learn the scaling statistics from the training features only, then apply
# the same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Training the Model

In [7]:
# Random forest with a fixed seed, trained on the scaled training features.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X=X_train_scaled, y=y_train)

## Predicting 

In [17]:
# Predict labels for the scaled test features with the fitted forest.
y_pred = rf_model.predict(X_test_scaled)

# Compare predictions with the true test labels: per-class report and accuracy.
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)

Accuracy: 0.6097070905518842
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.64      0.76     42652
           1       0.00      0.00      0.00      2140

    accuracy                           0.61     44792
   macro avg       0.46      0.32      0.38     44792
weighted avg       0.88      0.61      0.72     44792



In [20]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Locations of the two CSV inputs.
# NOTE(review): 'Out_of_time_data.csv' is used for training and
# 'claim_data_v1.csv' for evaluation -- confirm this assignment is intended.
train_file_path = 'Out_of_time_data.csv'
test_file_path = 'claim_data_v1.csv'

# Read both files into DataFrames (training file first, then the test file).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)


def clean_data(data):
    """Return a cleaned copy of ``data`` without modifying the input frame.

    Column names are stripped of surrounding whitespace, inner spaces are
    replaced by underscores, and everything is lower-cased. Rows containing
    any missing value are removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame with string column names.

    Returns
    -------
    pandas.DataFrame
        New frame with standardized column names and no rows with NaN.
    """
    # dropna() returns a new DataFrame, so renaming its columns below leaves
    # the caller's frame (and its original headers) untouched.
    cleaned = data.dropna()
    cleaned.columns = (
        cleaned.columns.str.strip()
        .str.replace(" ", "_", regex=False)  # literal space, not a regex
        .str.lower()
    )
    return cleaned

# Standardize headers and discard incomplete rows in both splits.
train_data, test_data = clean_data(train_data), clean_data(test_data)


# Remove these columns from both splits before modelling; errors='ignore'
# skips any name that is not present in a frame.
columns_to_drop = ['inspected_or_not', 'random']
train_data = train_data.drop(errors='ignore', columns=columns_to_drop)
test_data = test_data.drop(errors='ignore', columns=columns_to_drop)


from sklearn.preprocessing import LabelEncoder
# Integer-encode the categorical columns.
#
# Each encoder is fitted on the TRAINING values only, plus an explicit
# "Unknown" sentinel, so that:
#   * no information from the test split leaks into preprocessing, and
#   * categories that appear only in the test split have a valid code.
# The sentinel is part of the fitted vocabulary, so ``classes_`` is never
# edited after fitting and stays in the sorted order LabelEncoder produces.
categorical_cols = ['person_gender', 'person_education', 'plan_type', 'previous_fraud']
label_encoders = {}

for col in categorical_cols:
    # Cast to str so the sentinel can share one vocabulary with the column
    # values even if the column was parsed as numeric.
    train_values = train_data[col].astype(str)
    test_values = test_data[col].astype(str)

    le = LabelEncoder()
    le.fit(pd.concat([train_values, pd.Series(["Unknown"])], ignore_index=True))

    # Categories never seen during training are mapped to the sentinel.
    test_values = test_values.where(test_values.isin(le.classes_), "Unknown")

    train_data[col] = le.transform(train_values)
    test_data[col] = le.transform(test_values)
    label_encoders[col] = le  # kept so the mapping can be inspected or reused

# 'fraud_status' is the label; every remaining column is a feature.
target_column = 'fraud_status'

# Training split: feature matrix and target vector.
X_train, y_train = train_data.drop(columns=[target_column]), train_data[target_column]

# Testing split: feature matrix and target vector.
X_test, y_test = test_data.drop(columns=[target_column]), test_data[target_column]


# Learn the scaling statistics from the training features only, then apply
# the same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Resample the scaled training split with SMOTE (seeded for repeatability).
# The test split is left as-is.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train_scaled, y_train)
X_train_resampled, y_train_resampled = resampled


# Base estimator handed to the grid search.
svm_model = SVC(class_weight='balanced', random_state=42)

# Hyperparameter candidates: 3 x 2 x 2 = 12 combinations.
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)

# Exhaustive search over the grid, scored by ROC AUC on 5 stratified folds
# of the resampled training data.
grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5),
    verbose=1,
)
grid_search.fit(X_train_resampled, y_train_resampled)


# Take the estimator the grid search selected and show its parameters.
best_svm_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")


# Predict on the scaled test features and compare with the true labels.
y_pred = best_svm_model.predict(X_test_scaled)
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)



# cross_val_score is not imported anywhere else in this notebook; without this
# import the call below raises NameError on a fresh kernel.
from sklearn.model_selection import cross_val_score

# Re-score the selected SVM with 5-fold cross-validation on the resampled
# training data, using the estimator's default scorer.
# NOTE(review): the folds are drawn after SMOTE resampling, so synthetic
# samples can land in validation folds and the scores may be optimistic --
# confirm whether resampling should happen inside each fold instead.
cv_scores = cross_val_score(best_svm_model, X_train_resampled, y_train_resampled, cv=5)
print("Cross-validation scores:", cv_scores)
print(f"Average cross-validation score: {cv_scores.mean()}")


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy: 0.718007679942847
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.75      0.84     42652
           1       0.00      0.00      0.00      2140

    accuracy                           0.72     44792
   macro avg       0.47      0.38      0.42     44792
weighted avg       0.89      0.72      0.80     44792

Cross-validation scores: [0.87755102 0.79591837 0.79166667 0.70833333 0.83333333]
Average cross-validation score: 0.8013605442176871



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.3-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   - -------------------------------------- 4.5/124.9 MB 26.9 MB/s eta 0:00:05
   --- ------------------------------------ 11.3/124.9 MB 30.6 MB/s eta 0:00:04
   ----- ---------------------------------- 18.6/124.9 MB 31.7 MB/s eta 0:00:04
   -------- ------------------------------- 25.2/124.9 MB 31.2 MB/s eta 0:00:04
   ---------- ----------------------------- 33.6/124.9 MB 33.3 MB/s eta 0:00:03
   --------------- ------------------------ 48.2/124.9 MB 39.4 MB/s eta 0:00:02
   ------------------ --------------------- 56.6/124.9 MB 39.6 MB/s eta 0:00:02
   ------------------- -------------------- 59.8/124.9 MB 36.3 MB/s eta 0:00:02
   --------------------- ------------------ 68.2/124.9 MB 36.5 MB/s eta 0:00:02
   ------------------------- -------------- 79.7/124.9 MB 38.2