In [None]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found anywhere under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# NOTE(review): this instance is replaced by a fresh AdaBoostClassifier() in the
# modelling cell before anything is fitted, so only the import is needed here.
AdaBoost = AdaBoostClassifier()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation notices — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Use seaborn's "darkgrid" style for the plots that follow.
sns.set_style("darkgrid")

In [None]:
# Load the dataset CSV from an absolute path under the Kaggle input mount.
df = pd.read_csv("/kaggle/input/adult-dataset/adult_dataset.csv")

In [None]:
df.head(7)

In [None]:
# Relabel all 15 columns with snake_case names; 'salary' is the column later
# encoded and used as the prediction target.
# NOTE(review): assumes the CSV holds exactly these columns in this order.
df.columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'salary',
]

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
df.workclass.value_counts()

In [None]:
df.native_country.value_counts()

In [None]:
df.salary.value_counts()

In [None]:
df.sex.value_counts()

In [None]:
# Remove the 'education' column in place; 'education_num' stays in the frame.
df.drop(columns="education", inplace=True)

In [None]:
# One independent LabelEncoder per categorical column (eight in total), so each
# encoder is fitted on, and remembers, only its own column's categories.
(
    lworkclass,
    lmaritalstatus,
    loccupation,
    lrelationship,
    lrace,
    lsex,
    lnativecountry,
    lsalary,
) = (LabelEncoder() for _ in range(8))

In [None]:
# Encode each categorical column into a new CamelCase-named column, appended in
# the order listed. The raw string columns are still present after this cell.
for _encoded, _encoder, _source in (
    ("Workclass", lworkclass, "workclass"),
    ("MaritalStatus", lmaritalstatus, "marital_status"),
    ("Occupation", loccupation, "occupation"),
    ("Relationship", lrelationship, "relationship"),
    ("Race", lrace, "race"),
    ("Sex", lsex, "sex"),
    ("NativeCountry", lnativecountry, "native_country"),
    ("Salary", lsalary, "salary"),
):
    df[_encoded] = _encoder.fit_transform(df[_source])

In [None]:
# The encoded CamelCase columns replace the raw string columns, so remove the
# originals in place.
df.drop(
    columns=[
        'workclass',
        'marital_status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'native_country',
        'salary',
    ],
    inplace=True,
)

In [None]:
df.describe().T

In [None]:
# Histogram with a KDE overlay of weekly working hours. `distplot` is deprecated
# and scheduled for removal from seaborn; `histplot` with kde=True and a density
# scale is its documented replacement.
sns.histplot(x=df.hours_per_week, kde=True, stat="density")

In [None]:
# Pass the series as `x` explicitly: seaborn >= 0.12 treats the first positional
# argument as `data`, which can change how a bare Series is drawn.
sns.boxplot(x=df.hours_per_week)

In [None]:
def outlier_hours_per_week(df):
    """Cap ``hours_per_week`` outliers in place at the 1.5 * IQR fences.

    Values at or below ``Q1 - 1.5 * IQR`` become that lower fence and values at
    or above ``Q3 + 1.5 * IQR`` become that upper fence. The column on ``df`` is
    replaced; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``hours_per_week`` column. Modified in place.
    """
    column = 'hours_per_week'
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1

    # With a zero (or NaN) IQR both fences collapse onto one value, so capping
    # would overwrite every row with the same constant. Leave the column as is.
    if not iqr > 0:
        return

    # clip() upcasts an integer column when the fences are fractional; writing a
    # float into an int64 column through .loc is deprecated in pandas >= 2.1.
    df[column] = df[column].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)
    
outlier_hours_per_week(df)

In [None]:
# Re-draw the box plot after capping. The series is passed as `x` explicitly
# because seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x=df.hours_per_week)

In [None]:
# Pass the series as `x` explicitly: seaborn >= 0.12 treats the first positional
# argument as `data`, which can change how a bare Series is drawn.
sns.boxplot(x=df.education_num)

In [None]:
def outlier_education_num(df):
    """Cap ``education_num`` outliers in place at the 1.5 * IQR fences.

    Values at or below ``Q1 - 1.5 * IQR`` become that lower fence and values at
    or above ``Q3 + 1.5 * IQR`` become that upper fence. The column on ``df`` is
    replaced; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``education_num`` column. Modified in place.
    """
    column = 'education_num'
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1

    # With a zero (or NaN) IQR both fences collapse onto one value, so capping
    # would overwrite every row with the same constant. Leave the column as is.
    if not iqr > 0:
        return

    # clip() upcasts an integer column when the fences are fractional; writing a
    # float into an int64 column through .loc is deprecated in pandas >= 2.1.
    df[column] = df[column].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)
    
outlier_education_num(df)

In [None]:
# Re-draw the box plot after capping. The series is passed as `x` explicitly
# because seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x=df.education_num)

In [None]:
# Pass the series as `x` explicitly: seaborn >= 0.12 treats the first positional
# argument as `data`, which can change how a bare Series is drawn.
sns.boxplot(x=df['capital_loss'])

In [None]:
def outlier_capital_loss(df):
    """Cap ``capital_loss`` outliers in place at the 1.5 * IQR fences.

    Values at or below ``Q1 - 1.5 * IQR`` become that lower fence and values at
    or above ``Q3 + 1.5 * IQR`` become that upper fence. The column on ``df`` is
    replaced; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``capital_loss`` column. Modified in place.
    """
    column = 'capital_loss'
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1

    # With a zero (or NaN) IQR both fences collapse onto one value, so capping
    # would overwrite every row with the same constant and erase the feature.
    # This happens whenever at least the middle half of the values are equal.
    if not iqr > 0:
        return

    # clip() upcasts an integer column when the fences are fractional; writing a
    # float into an int64 column through .loc is deprecated in pandas >= 2.1.
    df[column] = df[column].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)

In [None]:
outlier_capital_loss(df)

In [None]:
# Re-draw the box plot after capping. The series is passed as `x` explicitly
# because seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x=df['capital_loss'])

In [None]:
def outlier_fnlwgt(df):
    """Cap ``fnlwgt`` outliers in place at the 1.5 * IQR fences.

    Values at or below ``Q1 - 1.5 * IQR`` become that lower fence and values at
    or above ``Q3 + 1.5 * IQR`` become that upper fence. The column on ``df`` is
    replaced; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``fnlwgt`` column. Modified in place.
    """
    column = 'fnlwgt'
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1

    # With a zero (or NaN) IQR both fences collapse onto one value, so capping
    # would overwrite every row with the same constant. Leave the column as is.
    if not iqr > 0:
        return

    # clip() upcasts an integer column when the fences are fractional; writing a
    # float into an int64 column through .loc is deprecated in pandas >= 2.1.
    df[column] = df[column].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)

In [None]:
outlier_fnlwgt(df)

In [None]:
# Box plot of fnlwgt after capping. The series is passed as `x` explicitly
# because seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x=df.fnlwgt)

In [None]:
# Capping at fractional IQR fences can leave float values in these columns, so
# bring them back to whole numbers. Series.round() applies the same
# round-half-to-even rule as the built-in round(), without a Python call per row.
df["fnlwgt"] = df["fnlwgt"].round().astype("int64")
df["education_num"] = df["education_num"].round().astype("int64")
df["capital_gain"] = df["capital_gain"].round().astype("int64")
df["capital_loss"] = df["capital_loss"].round().astype("int64")
df["hours_per_week"] = df["hours_per_week"].round().astype("int64")

In [None]:
# Target vector, then the feature matrix (every column except the target).
y = df["Salary"]
x = df.drop(columns="Salary")

In [None]:
# Annotated correlation heatmap over every column still in df, which includes
# the encoded Salary target alongside the features.
plt.figure(figsize=(12,7))
sns.heatmap(df.corr(), annot=True)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; random_state=42 makes the split repeatable.
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Standardise the features: fit the scaler on the training split only, then
# apply that already-fitted scaler to the test split.
sc=StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
x_train.shape, x_test.shape

In [None]:
# Seed the ensemble so the fitted model and its metrics are reproducible across
# runs; the value matches the seed used for the train/test split.
AdaBoost = AdaBoostClassifier(random_state=42)

In [None]:
AdaBoost.fit(x_train,y_train)

In [None]:
yPred = AdaBoost.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [None]:
accuracy_score(y_test,yPred)

In [None]:
confusion_matrix(y_test,yPred)

In [None]:
cr = classification_report(y_test, yPred)

In [None]:
print(cr)

In [None]:
# Pair the held-out labels with the model's predictions for side-by-side display.
result = dict(Actual=y_test, Predicted=yPred)

In [None]:
Predicted = pd.DataFrame(result)

In [None]:
Predicted