<a href="https://colab.research.google.com/github/shreyas-desai-stevens/KDDM-project/blob/main/KDDM_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mounting Drive

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset path below resolves.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# EDA

## Reading Data

In [None]:
# Column names, in file order; supplied to read_csv through `names=`.
COLUMN_NAMES = [
    'Age', 'Workclass', 'fnlwgt', 'Education', 'Education-num',
    'Marital-Status', 'Occupation', 'Relationship', 'Race', 'Sex',
    'Capital-gain', 'Capital-loss', 'Hours-per-week', 'Native_country',
    'Target',
]
# Location of the CSV under the mounted Drive.
DATA_PATH = '/content/drive/Shareddrives/KDDM_Project/adult.csv'

df = pd.read_csv(DATA_PATH, names=COLUMN_NAMES)

FileNotFoundError: ignored

In [None]:
# Preview the first rows of the freshly loaded frame to sanity-check the column names.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Per-column dtypes of the loaded frame.
df.dtypes

Dropping `Education-num` because it is the same as label-encoding the `Education` column. `Capital-gain` and `Capital-loss` are dropped in the same step.

In [None]:
# Remove the three columns that are not used in the rest of the notebook.
columns_to_drop = ['Education-num', 'Capital-gain', 'Capital-loss']
df = df.drop(columns=columns_to_drop)

## Data Exploration for each column

In [None]:
# For every column, report how many distinct values it holds and list them.
for column, values in df.items():
  print(f"{column} : {values.nunique()} ")
  print(f"Unique Values:{values.unique()}")
  print()

In [None]:
# Trim surrounding whitespace from every string cell; non-string values
# (numbers, NaN) pass through unchanged.
# Column-wise Series.map is used instead of DataFrame.applymap, which is
# deprecated since pandas 2.1 and emits a FutureWarning there.
def _strip_if_str(value):
    """Return `value` with surrounding whitespace removed when it is a str, else unchanged."""
    return value.strip() if isinstance(value, str) else value


df = df.apply(lambda column: column.map(_strip_if_str))

## Dropping Nulls

In [None]:
# Turn the '?' placeholder into NaN so pandas' null helpers can see it.
df = df.replace(to_replace='?', value=np.nan)

In [None]:
# Number of missing entries in each column.
null_values = df.isna().sum()
print(null_values)

In [None]:
# Independent (deep) copy, taken before `df` is label-encoded below.
df_dropped = df.copy()

## Label Encoding Values

In [None]:
# Label-encode only the non-numeric columns of `df`.
# Numeric columns are left untouched: running them through LabelEncoder would
# replace their real values with rank indices. This matches the later cell that
# encodes an explicit list of categorical columns on `df_dropped`.
# Re-running this cell is a no-op, because every column is numeric afterwards.
label_encoder = LabelEncoder()
categorical_columns = df.select_dtypes(exclude='number').columns
for column in categorical_columns:
    df[column] = label_encoder.fit_transform(df[column])
    print(f"{column} - Class Labels:", label_encoder.classes_)
    # Show a short sample rather than dumping the whole encoded Series.
    print("Encoded Values:", df[column].head().tolist())

In [None]:
# Preview the frame after the label-encoding cell above.
df.head()

In [None]:
# Separate the label column from the feature columns of the encoded frame.
y = df['Target']
X = df.drop(columns=['Target'])

In [None]:
# Per-column dtypes after the label-encoding cell above.
df.dtypes

# Visualizing Data

## Feature Importance

In [None]:
# Rank features with an extra-trees ensemble and plot the ten most important.
# ExtraTreesClassifier is imported in the notebook's import cell.
# A fixed random_state keeps the importances (and the chart) identical
# across Restart & Run All; the unseeded model gave different values each run.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, y)
print(model.feature_importances_)
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.nlargest(10).plot(kind='barh')
plt.title("Feature Importance")
plt.show()

In [None]:
# Discard every row that has at least one missing value.
df_dropped = df_dropped.dropna()

In [None]:
# (rows, columns) left after rows with missing values were removed.
df_dropped.shape

In [None]:
# Preview the null-free frame.
df_dropped.head()

In [None]:
# Encode the label column as integers; the other columns keep their
# original values for the plots that follow.
label_encoder = LabelEncoder().fit(df_dropped['Target'])
df_dropped['Target'] = label_encoder.transform(df_dropped['Target'])

## Distribution of Attributes with respect to Target

In [None]:
# Mean weekly working hours for each target class.
hours_per_week_by_target = df_dropped.groupby("Target")["Hours-per-week"].mean()

# Bar chart of the per-class means, labelled through the returned Axes.
ax = hours_per_week_by_target.plot.bar()
ax.set_xlabel("Target")
ax.set_ylabel("Average Hours per Week")
ax.set_title("Average Hours per Week by Pay")
plt.show()

In [None]:
# Row count per value of the Sex column, drawn as a bar chart.
gender = df_dropped['Sex'].value_counts()
ax = gender.plot.bar()
ax.set_title("Gender Distribution")

In [None]:
# Race counts among the rows whose encoded Target is 0
# (presumably the "<= 50k" class, as the title states).
# The frame is passed as `data=` by keyword: older seaborn releases interpret
# the first positional argument as `x`, not as the data frame.
target0_rows = df_dropped[df_dropped['Target'] == 0]
sns.countplot(data=target0_rows, x='Race')
plt.title("Distribution of Race for Target <= 50k")

In [None]:
# Race counts among the rows whose encoded Target is 1.
# Title corrected: this is the complement of the "<= 50k" plot, so it shows
# the "> 50k" class (the original title was a copy-paste of the Target == 0 one).
# The frame is passed as `data=` by keyword: older seaborn releases interpret
# the first positional argument as `x`, not as the data frame.
target1_rows = df_dropped[df_dropped['Target'] == 1]
sns.countplot(data=target1_rows, x='Race')
plt.title("Distribution of Race for Target > 50k")

In [None]:
# Count each target value within every Race group (one column per target value).
race_sex_target_counts = (
    df_dropped.groupby("Race")["Target"]
    .value_counts()
    .unstack()
)

# Grouped bar chart: one cluster per race, one bar per target value.
ax = race_sex_target_counts.plot.bar()
ax.set_xlabel("Race")
ax.set_ylabel("Count")
ax.set_title("Distribution of Race according to Pay")
# Extra bottom margin so the category labels are not clipped.
ax.figure.subplots_adjust(bottom=0.25)
plt.show()

In [None]:
# Count each target value within every Sex group (one column per target value).
race_sex_target_counts = (
    df_dropped.groupby("Sex")["Target"]
    .value_counts()
    .unstack()
)

# Grouped bar chart: one cluster per sex, one bar per target value.
ax = race_sex_target_counts.plot.bar()
ax.set_xlabel("Sex")
ax.set_ylabel("Count")
ax.set_title("Distribution of Sex according to Pay")
# Extra bottom margin so the category labels are not clipped.
ax.figure.subplots_adjust(bottom=0.25)
plt.show()

In [None]:
# Age distribution per target class, split by Sex.
# Title corrected: the classes are "<= 50k" and "> 50k"; the original
# ">= 50k" overlapped with the first class.
ax = sns.boxplot(x='Target', y='Age', hue='Sex', data=df_dropped)
ax.set_title("Box Plot Distributions of Gender by Age for Target <= 50k and Target > 50k")

In [None]:
# Age distribution per target class, split by Race.
# Title corrected: the classes are "<= 50k" and "> 50k"; the original
# ">= 50k" overlapped with the first class.
ax = sns.boxplot(x='Target', y='Age', hue='Race', data=df_dropped)
ax.set_title("Box Plot Distribution of Race by age for Target <= 50k and Target > 50k")

In [None]:
# Age distribution per target class, split by Workclass, on a larger canvas
# because Workclass has many categories.
# Title corrected: the classes are "<= 50k" and "> 50k"; the original
# ">= 50k" overlapped with the first class.
plt.figure(figsize=(10, 8))
ax = sns.boxplot(x='Target', y='Age', hue='Workclass', data=df_dropped)
ax.set_title("Box Plot Distribution of Workclass for Target <= 50k and Target > 50k")

In [None]:
# Density of Age within each target class.
# Title corrected: the classes are "<= 50k" and "> 50k"; the original
# ">= 50k" overlapped with the first class.
ax = sns.violinplot(x='Target', y='Age', data=df_dropped)
ax.set_title("Distribution of Age for Target <= 50k and Target > 50k")

In [None]:
# Density of Age for each Sex, with the two target classes side by side.
fig, ax = plt.subplots(figsize=(10, 10))
sns.violinplot(x='Sex', y='Age', hue='Target', data=df_dropped, ax=ax)
ax.set_title("Distribution of Age by Gender")

In [None]:
# Preview the frame before its categorical columns are label-encoded below.
df_dropped.head()

0 cannot be removed from Age. <br>
Workclass is biased towards private employees (verify with the box plot and the feature-importance chart). <br>
Normalize the values of `fnlwgt` (final weight) in the original df. <br>
Most of the people are high-school graduates or have a bachelor's degree. <br>
There are a lot of divorcees in the data.

In [None]:
# Integer-encode each categorical feature column and print the classes the
# encoder learned for it. The encoder is re-fitted per column, so after the
# loop it only remembers the last one.
categorical_columns = [
    'Workclass', 'Education', 'Marital-Status', 'Occupation',
    'Relationship', 'Race', 'Sex', 'Native_country',
]
label_encoder = LabelEncoder()
for column in categorical_columns:
    encoded = label_encoder.fit_transform(df_dropped[column])
    df_dropped[column] = encoded
    print(label_encoder.classes_)

# Model Training

## Splitting Data

In [None]:
# Separate the label column from the feature columns of the cleaned frame.
y = df_dropped['Target']
X = df_dropped.drop(columns=['Target'])

In [None]:
# Hold out 40% of the rows for testing.
# random_state pins the shuffle so every metric below is reproducible across runs;
# stratify=y keeps the class ratio of the target the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    shuffle=True,
    stratify=y,
    random_state=42,
)

## K-Nearest Neighbors

In [None]:
# Evaluate k-nearest-neighbours for several odd values of k.
# KNN is distance based, so the features are standardised first; otherwise
# columns with a large numeric range (e.g. fnlwgt) dominate the distance.
# The scaler is fitted on the training split only and then applied to both
# splits, so no test-set statistics leak into training.
knn_scaler = StandardScaler().fit(X_train)
X_train_scaled = knn_scaler.transform(X_train)
X_test_scaled = knn_scaler.transform(X_test)

for i in [1, 3, 5, 7, 9]:
  model = KNeighborsClassifier(n_neighbors=i)
  model.fit(X_train_scaled, y_train)
  y_preds = model.predict(X_test_scaled)
  accuracy = accuracy_score(y_test, y_preds)
  # Identify which k each report belongs to.
  print(f"n_neighbors = {i}")
  print("Accuracy: ", accuracy)
  print("Classification Report:")
  print(classification_report(y_test, y_preds))

## Gaussian Naive Bayes

In [None]:
# Fit Gaussian Naive Bayes on the training split and score it on the test split.
nb = GaussianNB()
Y_pred = nb.fit(X_train, y_train).predict(X_test)

nb_accuracy = accuracy_score(y_test, Y_pred)
nb_confusion = confusion_matrix(y_test, Y_pred)
nb_report = classification_report(y_test, Y_pred)

print(f"Model Accuracy: {nb_accuracy}")
print(f"Confusion Matrix:\n {nb_confusion}\n")
print(f"Classification Report:\n {nb_report}\n")

## Random Forest Classifier

In [None]:
# Fit a random forest on the training split and score it on the test split.
# random_state pins the bootstrap/feature sampling so the reported metrics
# are the same on every run; the unseeded forest gave different numbers each time.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
Y_pred = rf.predict(X_test)
print(f"Model Accuracy: {accuracy_score(y_test, Y_pred)}")
print(f"Confusion Matrix:\n {confusion_matrix(y_test, Y_pred)}\n")
print(f"Classification Report:\n {classification_report(y_test, Y_pred)}\n")

## XGBoost

In [None]:
# Fit XGBoost with default settings on the training split and score it on the test split.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
Y_pred = xgb.predict(X_test)

xgb_accuracy = accuracy_score(y_test, Y_pred)
xgb_confusion = confusion_matrix(y_test, Y_pred)
xgb_report = classification_report(y_test, Y_pred)

print(f"Model Accuracy: {xgb_accuracy}")
print(f"Confusion Matrix:\n {xgb_confusion}\n")
print(f"Classification Report:\n {xgb_report}\n")

# Hyper-Parameter Optimization

In [None]:
# Hyper-parameter search for the four classifiers.
#
# Changes relative to a plain GridSearchCV loop:
#   * Estimators are looked up by the same key as their grid, instead of
#     relying on a list and a dict happening to be in the same order.
#   * Grids with more than MAX_CANDIDATES combinations are sampled with
#     RandomizedSearchCV. The xgb grid alone has 3**9 = 19,683 combinations
#     (x 5 folds), which is not feasible to search exhaustively.
#   * Every fitted search is kept in `search_results`, so results for all
#     models remain available after the loop.
#   * Seeds are fixed so the search is reproducible.
knn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state=42)
gnb = GaussianNB()
xgb = XGBClassifier()
# Kept for backward compatibility with any cell that reads `models`.
models = [knn, gnb, rf, xgb]
estimators = {'knn': knn, 'gnb': gnb, 'rf': rf, 'xgb': xgb}

# Hyper-parameter grid per estimator, keyed like `estimators`.
param_grid = {
    'knn':{
      'n_neighbors': [1, 3, 5, 7, 9, 11],
      'weights': ['uniform', 'distance'],
      'metric': ['euclidean', 'manhattan']
    },
    'gnb':{
      'priors': [None]
    },
    'rf':{
      'n_estimators': [50, 100, 200],
      'max_depth': [None, 10, 20, 30],
      'min_samples_split': [2, 5, 10],
      'min_samples_leaf': [1, 2, 4],
    },
    'xgb':{
      'learning_rate': [0.01, 0.1, 0.2],
      'n_estimators': [50, 100, 200],
      'max_depth': [3, 5, 7],
      'min_child_weight': [1, 3, 5],
      'subsample': [0.8, 0.9, 1.0],
      'colsample_bytree': [0.8, 0.9, 1.0],
      'gamma': [0, 0.1, 0.2],
      'reg_alpha': [0, 0.1, 0.5],
      'reg_lambda': [0, 0.1, 0.5]
  }
}

# Largest grid that is still searched exhaustively; bigger grids are sampled.
MAX_CANDIDATES = 50

# model name -> fitted search object
search_results = {}

for model_name, hyperparameter_grid in param_grid.items():
    model = estimators[model_name]

    # Total number of parameter combinations in this grid.
    n_candidates = int(np.prod([len(values) for values in hyperparameter_grid.values()]))

    if n_candidates > MAX_CANDIDATES:
        print(f"Performing Randomized Search for {model_name} "
              f"({MAX_CANDIDATES} of {n_candidates} candidates)")
        grid_search = RandomizedSearchCV(
            model,
            hyperparameter_grid,
            n_iter=MAX_CANDIDATES,
            scoring='accuracy',
            cv=5,
            n_jobs=-1,
            random_state=42,
        )
    else:
        print(f"Performing Grid Search for {model_name}")
        grid_search = GridSearchCV(
            model,
            hyperparameter_grid,
            scoring='accuracy',
            cv=5,
            n_jobs=-1,
        )

    # Fit the search on the training split only.
    grid_search.fit(X_train, y_train)
    search_results[model_name] = grid_search

    # Report the winner for this model.
    print(f"Best Parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best Estimator for {model_name}: {grid_search.best_estimator_}")
    print(f"Best CV Accuracy for {model_name}: {grid_search.best_score_}")
    print()

# NOTE: after the loop `grid_search` still refers to the last search ('xgb').


Performing Grid Search for knn
Best Parameters for knn: {'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'uniform'}
Best Estimator for knn: KNeighborsClassifier(metric='manhattan', n_neighbors=11)

Performing Grid Search for gnb
Best Parameters for gnb: {'priors': None}
Best Estimator for gnb: GaussianNB()

Performing Grid Search for rf


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [None]:
# `grid_search` is reassigned on every iteration of the search loop above,
# so this is the best CV score of the last model searched only.
grid_search.best_score_

In [None]:
# Best estimator of the last model searched (see the loop above, which
# reassigns `grid_search` on every iteration).
grid_search.best_estimator_