#### Importing the modules

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import math as m
from numpy import concatenate as npcon

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#### Getting the data

In [None]:
# Load the dataset from a CSV file addressed relative to the working directory.
csv_path = './adult-updated.csv'
df = pd.read_csv(csv_path)

In [None]:
# Display the loaded DataFrame as the cell output.
df

In [None]:
# Summary statistics of the DataFrame's columns.
df.describe()

In [None]:
# Bar chart of how many rows fall into each income class.
sns.countplot(data=df, x='income')

In [None]:
# Number of rows per income class.
df["income"].value_counts()

In [None]:
# Remove the 'education' column from the DataFrame in place.
df.drop(columns=['education'], inplace=True)

In [None]:
# Print the share of each income class as a percentage.
# The shares are computed from the data itself: the previously hand-typed
# counts did not add up to the stated total, so the printed percentages were
# wrong and would also go stale whenever the CSV changes.
income_share = df["income"].value_counts(normalize=True)
for income_label, share in income_share.items():
    # Labels are stripped so stray whitespace in the raw CSV values does not
    # leak into the message.
    print(f"there are {round(share * 100, 2)}% persons who have a income {str(income_label).strip()}")

#### Performing feature engineering in order to extract valuable information from the given raw data

In [None]:
# Count the missing values in every column.
df.isna().sum()

In [None]:
# Names of the numeric columns.
# The earlier check compared each dtype against the string "0" (digit zero)
# instead of "O" (object), so it never filtered anything out; selecting by
# numeric dtype expresses the intent directly.
numerical = df.select_dtypes(include="number").columns.tolist()

In [None]:
# Report how many distinct values each selected column holds
# (missing values are counted as a value, too).
for column in numerical:
    distinct_count = df[column].nunique(dropna=False)
    print(f"{column} : {distinct_count}")

In [None]:
# Distinct labels present in the income column.
pd.unique(df['income'])

In [None]:
# Columns that are standardised by the scaling loop later in the notebook.
numeric_cols = [
    'age',
    'fnlwgt',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]

In [None]:
# Columns that get label-encoded (the categorical inputs plus the income
# target). Each column is listed exactly once so that none is encoded twice.
features = ['workclass', 'occupation', 'marital_status', 'relationship',
            'race', 'sex', 'native_country', 'income']


In [None]:
# Replace every listed column with integer codes; a fresh encoder is fitted
# for each column.
for column_name in features:
    le = LabelEncoder()
    encoded_values = le.fit_transform(df[column_name])
    df[column_name] = encoded_values

In [None]:
# Preview the first rows after label encoding.
df.head()

In [None]:
# Standardise each continuous column independently with its own scaler.
# NOTE(review): the scaler is fitted on the full DataFrame, i.e. before the
# train/test split further down, so test rows influence the scaling
# statistics — confirm this is acceptable.
for col in numeric_cols:
    std = StandardScaler()
    scaled = std.fit_transform(df[[col]])
    df[col] = scaled.flatten()

In [None]:
# Preview the first rows after scaling the numeric columns.
df.head()

#### splitting the data into training and test set

In [None]:
# Target: the income column. Features: every remaining column.
y = df['income']
X = df.drop(columns=['income'])

In [None]:
# Hold out 25% of the rows for testing; the split is shuffled, seeded and
# stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
    stratify=y,
)

In [None]:
# Print the shape of each split, in the order: train features, test features,
# train target, test target.
for split in (X_train, X_test, y_train, y_test):
    print(np.shape(split))

#### Model Selection

#### Logistic Regression

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression model on the scaled training split.
classifier = LogisticRegression(random_state=0)
classifier.fit(X=X_train, y=y_train)

In [None]:
# Predictions on the training split.
# X_train was already standardised with `sc` right after the split, so it is
# passed as-is; transforming it a second time would feed the model
# double-scaled inputs.
classifier.predict(X_train)

In [None]:
# Predict the income class for the held-out test split.
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
# Confusion matrix, accuracy and per-class report for logistic regression.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
log_classifier_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
log_report = classification_report(y_true=y_test, y_pred=y_pred)
print(log_report)

#### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit an entropy-based decision tree on the scaled training split.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X=X_train, y=y_train)

In [None]:
# Predictions on the training split.
# X_train was already standardised with `sc` right after the split, so it is
# passed as-is; transforming it a second time would feed the model
# double-scaled inputs.
classifier.predict(X_train)

In [None]:
# Predict the income class for the held-out test split.
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Confusion matrix, accuracy and per-class report for the decision tree.
decisiontree_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
confusion_matrixdtc = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion_matrixdtc)
dtc_report = classification_report(y_true=y_test, y_pred=y_pred)
print(dtc_report)

#### Random Forest Classifier Model and Report

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 20-tree, entropy-based random forest on the scaled training split.
classifier = RandomForestClassifier(
    n_estimators=20,
    criterion='entropy',
    random_state=0,
)
classifier.fit(X=X_train, y=y_train)

In [None]:
# Predict the income class for the held-out test split.
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Confusion matrix, accuracy and per-class report for the random forest.
# The matrix printed here is the random forest's own one; printing
# `confusion_matrixdtc` would show the decision tree's results instead.
confusion_matrixrfc = confusion_matrix(y_test, y_pred)
print(confusion_matrixrfc)
rtc_classifier_report = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))

#### KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbours model (Minkowski metric with p=2) on the scaled
# training split.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
classifier.fit(X=X_train, y=y_train)

In [None]:
# Predictions on the training split.
# X_train was already standardised with `sc` right after the split, so it is
# passed as-is; transforming it a second time would feed the model
# double-scaled inputs.
classifier.predict(X_train)

In [None]:
# Predict the income class for the held-out test split.
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Confusion matrix, accuracy and per-class report for the KNN model.
confusion_matrixknn = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion_matrixknn)
knn_accuracy_report = accuracy_score(y_true=y_test, y_pred=y_pred)
knn_report = classification_report(y_true=y_test, y_pred=y_pred)
print(knn_report)

In [None]:
# Side-by-side accuracy summary of the four models trained above.
summary_lines = [
    f"Decision Tree Classifier: {decisiontree_accuracy:.4f}",
    f"Random Forest Classifier: {rtc_classifier_report:.4f}",  # for trees = 20
    f"KNN: {knn_accuracy_report:.3f}",
    f"LogisticRegression classifier model accuracy report: {log_classifier_accuracy:.3f}",
]
print("\n".join(summary_lines))

#### SVC Classifier (with linear kernel)

In [None]:
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Generate a synthetic 3-class problem: 5000 samples, 10 features, one
# cluster per class.
# NOTE(review): this data is synthetic and unrelated to the adult-income
# DataFrame, and the assignment rebinds `y`, which previously held the income
# target — confirm this is intended.
synthetic_options = {
    "n_samples": 5000,
    "n_features": 10,
    "n_classes": 3,
    "n_clusters_per_class": 1,
}
x, y = make_classification(**synthetic_options)

In [None]:
# Train and evaluate the SVC on the adult-income split created earlier in the
# notebook (already standardised with `sc`). Splitting the synthetic
# `make_classification` arrays here would make the SVC scores describe
# unrelated data, not the dataset the other classifiers were evaluated on.
xtrain, xtest, ytrain, ytest = X_train, X_test, y_train, y_test

In [None]:
# Linear support-vector classifier created with verbose=0; printing the
# estimator shows its configuration.
lsvc = LinearSVC(verbose=0)
print(lsvc)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [None]:
# Fit on the training split and report the score on that same split
# (i.e. a training score, not a held-out one).
score = lsvc.fit(xtrain, ytrain).score(xtrain, ytrain)
print("Score: ", score)

In [None]:
# 10-fold cross-validation on the training split; report the mean fold score.
cv_scores = cross_val_score(estimator=lsvc, X=xtrain, y=ytrain, cv=10)
print(f"CV average score: {cv_scores.mean():.2f}")

In [None]:
# Predict on the test split and print the resulting confusion matrix.
ypred = lsvc.predict(xtest)
cm = confusion_matrix(y_true=ytest, y_pred=ypred)
print(cm)

In [None]:
# Per-class precision/recall/F1 report for the SVC test predictions.
cr = classification_report(y_true=ytest, y_pred=ypred)
print(cr)

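### The linear SVC classifier reported an aggregate accuracy of about 90% in this run. It can be called the most accurate model for the given dataset only if every model is evaluated on the same train/test split of the same data.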