# Importing Libraries

In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline
warnings.filterwarnings('ignore')

# Loading dataset

In [2]:
# Read the salary records from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="salaries.csv")

In [3]:
# Display the loaded frame to confirm the columns and values read from the CSV.
data

Unnamed: 0,company,job,degree,salary_more_then_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0
5,google,computer programmer,masters,1
6,abc pharma,sales executive,masters,0
7,abc pharma,computer programmer,bachelors,0
8,abc pharma,business manager,bachelors,0
9,abc pharma,business manager,masters,1


# Splitting into feature matrix X and target vector y

In [4]:
# The label column becomes the target; every remaining column is a predictor.
y = data['salary_more_then_100k']
X = data.drop(columns='salary_more_then_100k')

# Data Transformation

## Encoding non-numerical columns as integers

In [5]:
# One encoder per categorical column, so each keeps its own
# category -> integer mapping and can be reused to encode new rows later.
# LabelEncoder is imported in the notebook's top import cell.
# NOTE(review): scikit-learn documents LabelEncoder for target labels;
# OrdinalEncoder / OneHotEncoder are the feature-side equivalents.
le_company = LabelEncoder()
le_job = LabelEncoder()
le_degree = LabelEncoder()

In [6]:
# Fit each encoder on its text column and store the integer codes in a
# sibling "<name>_n" column; the original text columns are left in place here.
for source, target, encoder in (
    ('company', 'company_n', le_company),
    ('job', 'job_n', le_job),
    ('degree', 'degree_n', le_degree),
):
    X[target] = encoder.fit_transform(X[source])
X

Unnamed: 0,company,job,degree,company_n,job_n,degree_n
0,google,sales executive,bachelors,2,2,0
1,google,sales executive,masters,2,2,1
2,google,business manager,bachelors,2,0,0
3,google,business manager,masters,2,0,1
4,google,computer programmer,bachelors,2,1,0
5,google,computer programmer,masters,2,1,1
6,abc pharma,sales executive,masters,0,2,1
7,abc pharma,computer programmer,bachelors,0,1,0
8,abc pharma,business manager,bachelors,0,0,0
9,abc pharma,business manager,masters,0,0,1


In [7]:
# Keep only the encoded columns for modelling.
# errors='ignore' makes this cell safe to re-run: once the text columns are
# gone, a second execution is a no-op instead of raising KeyError.
X = X.drop(columns=['company', 'job', 'degree'], errors='ignore')
X

Unnamed: 0,company_n,job_n,degree_n
0,2,2,0
1,2,2,1
2,2,0,0
3,2,0,1
4,2,1,0
5,2,1,1
6,0,2,1
7,0,1,0
8,0,0,0
9,0,0,1


In [8]:
# Display the target Series taken from the salary_more_then_100k column.
y

0     0
1     0
2     1
3     1
4     0
5     1
6     0
7     0
8     0
9     1
10    1
11    1
12    1
13    1
14    1
15    1
Name: salary_more_then_100k, dtype: int64

# Splitting into train and test datasets

In [9]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split reproducible across kernel restarts.
# train_test_split is imported in the notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=43
)

# Decision Tree Classification Model

## Instantiating and fitting the model

In [10]:
# Seed the tree so the fitted model and the scores below are reproducible.
# Without random_state, ties between equally good splits are broken randomly
# and repeated runs can produce different trees.
# The seed matches the one used for the train/test split.
# DecisionTreeClassifier is imported in the notebook's top import cell.
dtc = DecisionTreeClassifier(random_state=43)

In [11]:
# Train the tree on the training portion of the split.
dtc.fit(X=X_train, y=y_train)

## Evaluating the model

In [12]:
# Score the held-out rows. accuracy_score and f1_score are imported in the
# notebook's top import cell rather than in the middle of this cell.
y_pred = dtc.predict(X_test)
print("Accuracy Score:{:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("F1 Score:", f1_score(y_test, y_pred))

Accuracy Score:75.00%
F1 Score: 0.8


# Predicting the result

In [13]:
# Positional row 2 of the original frame: the worked example used for the
# single prediction at the end of the notebook.
data.iloc[2]

company                            google
job                      business manager
degree                          bachelors
salary_more_then_100k                   1
Name: 2, dtype: object

## Checking whether a business manager at Google with a bachelor's degree earns more than 100k

In [14]:
# Positional row 2 of the encoded feature frame.
X.iloc[2]

company_n    2
job_n        0
degree_n     0
Name: 2, dtype: int32

In [15]:
# Target value at positional row 2.
y.iloc[2]

1

In [16]:
# Build the query row from the raw category names using the fitted encoders,
# instead of hand-typing their integer codes. The DataFrame carries the same
# column names the model was trained on, so predict() sees matching features.
sample = pd.DataFrame(
    {
        'company_n': le_company.transform(['google']),
        'job_n': le_job.transform(['business manager']),
        'degree_n': le_degree.transform(['bachelors']),
    }
)

# predict() returns an array with one entry per query row; take the single
# entry explicitly rather than using the whole array as the condition.
if dtc.predict(sample)[0] == 1:
    print("Salary is more than 100k")
else:
    print("Salary is less than 100k")

Salary is more than 100k
