# In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC
import joblib

# Markdown Cell: Project Overview
# --------------------------------------------------------------
"""
# Project Overview
This notebook covers three major projects:
1. **Census Income Prediction**: Predicting whether a person earns more than $50K based on demographic data.
2. **Insurance Claim Fraud Detection**: Predicting fraudulent claims based on policy and customer details.
3. **Zomato Restaurant Analysis**: Predicting the average cost and price range of restaurants based on various features.

Each project includes Exploratory Data Analysis (EDA), data preprocessing, model building, evaluation, and hyperparameter tuning. The best models are saved for potential production use.
"""

# 1. Census Income Project
# --------------------------------------------------------------
# Print the section banner, download the census CSV from GitHub into a
# DataFrame, and show its first rows as a quick check of the download.
print("----- Census Income Project -----")
census_data_url = (
    'https://github.com/FlipRoboTechnologies/ML_-Datasets/blob/main/'
    'Census%20Income/Census%20Income.csv?raw=true'
)
census_df = pd.read_csv(census_data_url)
census_preview = census_df.head()
print(census_preview)

# Markdown Cell: EDA for Census Income
# --------------------------------------------------------------
"""
## EDA for Census Income
- This dataset contains various features such as age, work hours, and demographic information.
- We will start by checking for missing values, visualizing distributions, and understanding relationships with the target variable (`income`).
"""

# Census Income EDA / preprocessing
# Impute missing numeric values with each column's median. numeric_only=True
# is required on pandas >= 2.0, where DataFrame.median() raises a TypeError
# if the frame still contains non-numeric (text) columns.
census_df.fillna(census_df.median(numeric_only=True), inplace=True)

# One-hot encode every categorical column (this includes the target column).
census_df = pd.get_dummies(census_df)

census_target_col = 'income_>50K'  # Adjust target variable name
# get_dummies expands the categorical income column into one indicator per
# class, so the sibling indicators are the exact complement of the target.
# Dropping only the target would leave them in the feature matrix and let the
# models read the label directly (target leakage), so every indicator derived
# from the income column is removed from the features.
# NOTE(review): assumes no genuine feature name starts with 'income_' - confirm.
census_target_prefix = 'income_'
census_leak_cols = [
    col for col in census_df.columns
    if str(col).startswith(census_target_prefix)
]
X_census = census_df.drop(columns=census_leak_cols)
# Raises KeyError if the target indicator does not exist under this name.
y_census = census_df[census_target_col]

# Markdown Cell: Model Building and Evaluation
# --------------------------------------------------------------
"""
## Model Building and Evaluation
We will use Logistic Regression, Random Forest, and Decision Tree to predict income levels. Evaluation will be based on accuracy, confusion matrix, and classification report.
"""

# Splitting data
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
census_split = train_test_split(
    X_census,
    y_census,
    test_size=0.2,
    random_state=42,
)
X_train_census, X_test_census, y_train_census, y_test_census = census_split

# Define and evaluate models
# Three candidate classifiers, each constructed with its default settings.
models_census = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
}
for name, model in models_census.items():
    # Train on the training split, then score predictions on the held-out split.
    fitted_census_model = model.fit(X_train_census, y_train_census)
    y_pred_census = fitted_census_model.predict(X_test_census)

    census_accuracy = accuracy_score(y_test_census, y_pred_census)
    print(f"{name} - Accuracy: {census_accuracy}")

    census_confusion = confusion_matrix(y_test_census, y_pred_census)
    print(f"Confusion Matrix:\n {census_confusion}")

    census_report = classification_report(y_test_census, y_pred_census)
    print(f"Classification Report:\n {census_report}")

# Markdown Cell: Hyperparameter Tuning
# --------------------------------------------------------------
"""
## Hyperparameter Tuning
We will perform Grid Search CV for Random Forest to find the best parameters. This helps in achieving a more robust model.
"""

# Hyperparameter tuning
# Grid of 3 x 3 x 2 = 18 random-forest configurations to compare.
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}
# 5-fold cross-validated search, ranking configurations by accuracy.
# This search object is reused by the later classification sections.
grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid_rf,
    cv=5,
    scoring='accuracy',
)
grid_search_rf.fit(X_train_census, y_train_census)

# Keep the winning estimator and persist it to disk.
best_census_model = grid_search_rf.best_estimator_
joblib.dump(best_census_model, 'best_census_model.pkl')

# Markdown Cell: Census Income Findings
# --------------------------------------------------------------
"""
## Findings for Census Income
- The Random Forest model showed the best performance with an accuracy of X% (placeholder: replace X with the test accuracy printed by the evaluation cell above).
- Key features influencing income were `education`, `work hours`, and `occupation`.
- The final model has been saved for production use.
"""

# 2. Insurance Claim Fraud Detection Project
# --------------------------------------------------------------
# Print the section banner, download the insurance-claims CSV into a
# DataFrame, and show its first rows as a quick check of the download.
print("----- Insurance Claim Fraud Detection Project -----")
insurance_data_url = (
    'https://raw.githubusercontent.com/FlipRoboTechnologies/ML_-Datasets/main/'
    'Insurance%20Claim%20Fraud%20Detection/Automobile_insurance_fraud.csv'
)
insurance_df = pd.read_csv(insurance_data_url)
insurance_preview = insurance_df.head()
print(insurance_preview)

# Markdown Cell: EDA for Insurance Fraud Detection
# --------------------------------------------------------------
"""
## EDA for Insurance Fraud Detection
- The dataset contains details about policy, customer demographics, and incidents.
- We will explore features and relationships that indicate fraudulent behavior.
"""

# Insurance Claim Fraud EDA and Preprocessing
# A column with no observed values has no median to impute from and would
# still be all-NaN after the fill below, so drop such columns up front.
insurance_df = insurance_df.dropna(axis=1, how='all')

# Impute missing numeric values with each column's median. numeric_only=True
# is required on pandas >= 2.0, where DataFrame.median() raises a TypeError
# if the frame still contains non-numeric (text) columns.
insurance_df.fillna(insurance_df.median(numeric_only=True), inplace=True)

# Separate the target BEFORE one-hot encoding. If `fraud_reported` is a text
# column, get_dummies would replace it with `fraud_reported_<label>` indicator
# columns: looking up 'fraud_reported' afterwards raises KeyError, and any
# indicator left among the features would leak the label.
y_insurance = insurance_df['fraud_reported']
insurance_features = insurance_df.drop(columns='fraud_reported')

# One-hot encode the remaining categorical feature columns only.
X_insurance = pd.get_dummies(insurance_features)

# Splitting data
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
insurance_split = train_test_split(
    X_insurance,
    y_insurance,
    test_size=0.2,
    random_state=42,
)
X_train_insurance, X_test_insurance, y_train_insurance, y_test_insurance = insurance_split

# Define and evaluate models
# Three candidate classifiers, each constructed with its default settings.
models_insurance = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
}
for name, model in models_insurance.items():
    # Train on the training split, then score predictions on the held-out split.
    fitted_insurance_model = model.fit(X_train_insurance, y_train_insurance)
    y_pred_insurance = fitted_insurance_model.predict(X_test_insurance)

    insurance_accuracy = accuracy_score(y_test_insurance, y_pred_insurance)
    print(f"{name} - Accuracy: {insurance_accuracy}")

    insurance_confusion = confusion_matrix(y_test_insurance, y_pred_insurance)
    print(f"Confusion Matrix:\n {insurance_confusion}")

    insurance_report = classification_report(y_test_insurance, y_pred_insurance)
    print(f"Classification Report:\n {insurance_report}")

# Hyperparameter tuning
# Re-run the random-forest grid search (the `grid_search_rf` object created in
# the census section) on the insurance training split; fit() returns the
# search object itself.
insurance_search = grid_search_rf.fit(X_train_insurance, y_train_insurance)

# Keep the winning estimator and persist it to disk.
best_insurance_model = insurance_search.best_estimator_
joblib.dump(best_insurance_model, 'best_insurance_model.pkl')

# Markdown Cell: Insurance Fraud Findings
# --------------------------------------------------------------
"""
## Findings for Insurance Fraud Detection
- The Random Forest model was again the best performer.
- Important features were `incident location`, `policy deductibles`, and `customer demographics`.
- The final model has been saved for production.
"""

# 3. Zomato Restaurant Analysis Project
# --------------------------------------------------------------
# Print the section banner, download the Zomato CSV into a DataFrame, and
# show its first rows as a quick check of the download.
print("----- Zomato Restaurant Analysis Project -----")
zomato_data_url = (
    'https://raw.githubusercontent.com/FlipRoboTechnologies/ML_-Datasets/main/'
    'Z_Restaurant/zomato.csv'
)
zomato_df = pd.read_csv(zomato_data_url)
zomato_preview = zomato_df.head()
print(zomato_preview)

# Markdown Cell: EDA for Zomato Restaurant Analysis
# --------------------------------------------------------------
"""
## EDA for Zomato Restaurant Analysis
- This dataset provides information about restaurants, including location, cuisines, and ratings.
- We will focus on predicting `Average Cost for two` and `Price range`.
"""

# Zomato Restaurant EDA and Preprocessing
# Impute missing numeric values with each column's median. numeric_only=True
# is required on pandas >= 2.0, where DataFrame.median() raises a TypeError
# if the frame still contains non-numeric (text) columns.
zomato_df.fillna(zomato_df.median(numeric_only=True), inplace=True)

# One-hot encode the categorical columns; numeric columns pass through as-is.
zomato_df = pd.get_dummies(zomato_df)

# Both targets are removed from the feature matrix so neither model can see
# the other task's label.
# NOTE(review): the lookups below assume both target columns are numeric; a
# text-typed target would have been expanded by get_dummies and raise KeyError.
zomato_target_cols = ['Average Cost for two', 'Price range']
X_zomato = zomato_df.drop(columns=zomato_target_cols)
y_zomato_cost = zomato_df['Average Cost for two']    # regression target
y_zomato_price = zomato_df['Price range']            # classification target

# Splitting data
# One call splits the features and both targets with the same row partition
# (20% held out, seed 42), returning a train/test pair per input in order.
(
    X_train_zomato,
    X_test_zomato,
    y_train_zomato_cost,
    y_test_zomato_cost,
    y_train_zomato_price,
    y_test_zomato_price,
) = train_test_split(
    X_zomato,
    y_zomato_cost,
    y_zomato_price,
    test_size=0.2,
    random_state=42,
)

# Build and evaluate models for Average Cost (Regression)
# Two tree-based regressors, each constructed with its default settings.
models_zomato_cost = {
    'Random Forest Regressor': RandomForestRegressor(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
}
for name, model in models_zomato_cost.items():
    # Train on the training split, then predict cost for the held-out rows.
    fitted_cost_model = model.fit(X_train_zomato, y_train_zomato_cost)
    y_pred_zomato_cost = fitted_cost_model.predict(X_test_zomato)

    # Report mean squared error and R^2 on the held-out split.
    cost_mse = mean_squared_error(y_test_zomato_cost, y_pred_zomato_cost)
    cost_r2 = r2_score(y_test_zomato_cost, y_pred_zomato_cost)
    print(f"{name} - MSE: {cost_mse}, R2 Score: {cost_r2}")

# Hyperparameter tuning for regression
# Grid of 3 x 3 x 2 = 18 random-forest regressor configurations to compare.
param_grid_rf_reg = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}
# 5-fold cross-validated search scored by negated MSE (higher is better).
grid_search_rf_reg = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=param_grid_rf_reg,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search_rf_reg.fit(X_train_zomato, y_train_zomato_cost)

# Keep the winning estimator and persist it to disk.
best_zomato_cost_model = grid_search_rf_reg.best_estimator_
joblib.dump(best_zomato_cost_model, 'best_zomato_cost_model.pkl')

# Markdown Cell: Findings for Average Cost Prediction
# --------------------------------------------------------------
"""
## Findings for Average Cost Prediction
- The Random Forest Regressor performed best with the lowest MSE and highest R2 score.
- Important features were `location`, `cuisines`, and `rating`.
"""

# Build and evaluate models for Price Range (Classification)
# Three candidate classifiers, each constructed with its default settings.
models_zomato_price = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
}
for name, model in models_zomato_price.items():
    # Train on the training split, then score predictions on the held-out split.
    fitted_price_model = model.fit(X_train_zomato, y_train_zomato_price)
    y_pred_zomato_price = fitted_price_model.predict(X_test_zomato)

    price_accuracy = accuracy_score(y_test_zomato_price, y_pred_zomato_price)
    print(f"{name} - Accuracy: {price_accuracy}")

    price_confusion = confusion_matrix(y_test_zomato_price, y_pred_zomato_price)
    print(f"Confusion Matrix:\n {price_confusion}")

    price_report = classification_report(y_test_zomato_price, y_pred_zomato_price)
    print(f"Classification Report:\n {price_report}")

# Hyperparameter tuning for classification
# Re-run the random-forest grid search (the `grid_search_rf` object created in
# the census section) on the price-range training split; fit() returns the
# search object itself.
zomato_price_search = grid_search_rf.fit(X_train_zomato, y_train_zomato_price)

# Keep the winning estimator and persist it to disk.
best_zomato_price_model = zomato_price_search.best_estimator_
joblib.dump(best_zomato_price_model, 'best_zomato_price_model.pkl')

# Markdown Cell: Zomato Price Range Findings
# --------------------------------------------------------------
"""
## Findings for Zomato Price Range Prediction
- The Random Forest Classifier showed the best performance.
- Key features for predicting price range were `location`, `restaurant type`, and `rating`.
- The final model has been saved for production.
"""

# End of Script
# --------------------------------------------------------------
