In [1]:
import pandas as pd
# Read the salary dataset; the path is relative to the current working directory.
df = pd.read_csv("salaries.csv")

# Preview the first five rows to check the columns that were parsed.
df.head(n=5)

Unnamed: 0,company,job,degree,salary_more_then_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0


In [2]:
# Size of the loaded frame as a (rows, columns) tuple.
df.shape

(16, 4)

#### Feature Engineering: Label and One-Hot Encoding

In [3]:
# List the distinct degree values so the encoding below can cover every one of them.
df["degree"].unique()

array(['bachelors', 'masters'], dtype=object)

In [4]:
# Encode the text `degree` column as integers (bachelors -> 1, masters -> 2).
degree_codes = {"bachelors": 1, "masters": 2}
df["degree_no"] = df["degree"].map(degree_codes)

# Series.map yields NaN for any value that is missing from the mapping; fail loudly
# here instead of letting NaN codes flow into the later encoding and training cells.
assert df["degree_no"].notna().all(), "unmapped value in 'degree' column"
df.head()

Unnamed: 0,company,job,degree,salary_more_then_100k,degree_no
0,google,sales executive,bachelors,0,1
1,google,sales executive,masters,0,2
2,google,business manager,bachelors,1,1
3,google,business manager,masters,1,2
4,google,computer programmer,bachelors,0,1


In [5]:
# Remove the text `degree` column; `degree_no` already holds its integer encoding.
# Rebinding `df` (instead of inplace=True) combined with errors="ignore" keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=["degree"], errors="ignore")
df.head()

Unnamed: 0,company,job,salary_more_then_100k,degree_no
0,google,sales executive,0,1
1,google,sales executive,0,2
2,google,business manager,1,1
3,google,business manager,1,2
4,google,computer programmer,0,1


In [6]:
# One-hot encode the two categorical columns. drop_first=True removes the first level
# of each column, so k categories become k-1 indicator columns.
df_encoded = pd.get_dummies(
    df,
    columns=["company", "job"],
    drop_first=True,
)
df_encoded

Unnamed: 0,salary_more_then_100k,degree_no,company_facebook,company_google,job_computer programmer,job_sales executive
0,0,1,False,True,False,True
1,0,2,False,True,False,True
2,1,1,False,True,False,False
3,1,2,False,True,False,False
4,0,1,False,True,True,False
5,1,2,False,True,True,False
6,0,2,False,False,False,True
7,0,1,False,False,True,False
8,0,1,False,False,False,False
9,1,2,False,False,False,False


#### Model Training using criterion "gini"

In [17]:
from sklearn.tree import DecisionTreeClassifier

# Split the encoded frame into the feature matrix and the target column.
X = df_encoded.drop(columns=["salary_more_then_100k"])
y = df_encoded["salary_more_then_100k"]

# scikit-learn permutes the candidate features at random at every split, so ties
# between equally good splits can be resolved differently from run to run.
# Fixing random_state makes the fitted tree (and its exported rules) reproducible.
model = DecisionTreeClassifier(criterion="gini", random_state=42)
model.fit(X, y)

In [18]:
from sklearn.metrics import classification_report

# NOTE: predictions are made on the same X the model was fitted on, so this report
# describes how well the tree reproduces its training data, not held-out performance.
y_pred = model.predict(X)
report = classification_report(y_true=y, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        10

    accuracy                           1.00        16
   macro avg       1.00      1.00      1.00        16
weighted avg       1.00      1.00      1.00        16



In [19]:
from sklearn.tree import export_text

# Render the fitted tree as indented text rules, labelled with the feature column names.
tree_rules = export_text(model, feature_names=list(X.columns))
print(tree_rules)

|--- company_facebook <= 0.50
|   |--- job_sales executive <= 0.50
|   |   |--- degree_no <= 1.50
|   |   |   |--- job_computer programmer <= 0.50
|   |   |   |   |--- company_google <= 0.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- company_google >  0.50
|   |   |   |   |   |--- class: 1
|   |   |   |--- job_computer programmer >  0.50
|   |   |   |   |--- class: 0
|   |   |--- degree_no >  1.50
|   |   |   |--- class: 1
|   |--- job_sales executive >  0.50
|   |   |--- class: 0
|--- company_facebook >  0.50
|   |--- class: 1



#### Model Training using criterion "entropy"


In [20]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# Refit on the same X / y using entropy as the split criterion.
# This rebinds `model`, so the cells below operate on the entropy tree.
# random_state is fixed because scikit-learn permutes features at random at each
# split; without it, tie-breaking (and the exported tree) can change between runs.
model = DecisionTreeClassifier(criterion="entropy", random_state=42)
model.fit(X, y)

# NOTE: evaluated on the training data itself, so this measures fit, not generalisation.
y_pred = model.predict(X)
report = classification_report(y, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        10

    accuracy                           1.00        16
   macro avg       1.00      1.00      1.00        16
weighted avg       1.00      1.00      1.00        16



In [10]:
from sklearn.tree import export_text
# Print the text rules of whichever tree `model` currently refers to.
feature_labels = list(X.columns)
print(export_text(model, feature_names=feature_labels))

|--- company_facebook <= 0.50
|   |--- job_sales executive <= 0.50
|   |   |--- degree_no <= 1.50
|   |   |   |--- job_computer programmer <= 0.50
|   |   |   |   |--- company_google <= 0.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- company_google >  0.50
|   |   |   |   |   |--- class: 1
|   |   |   |--- job_computer programmer >  0.50
|   |   |   |   |--- class: 0
|   |   |--- degree_no >  1.50
|   |   |   |--- class: 1
|   |--- job_sales executive >  0.50
|   |   |--- class: 0
|--- company_facebook >  0.50
|   |--- class: 1



In [11]:
# Show the first two feature rows as a reminder of the column order expected by predict().
X.head(n=2)

Unnamed: 0,degree_no,company_facebook,company_google,job_computer programmer,job_sales executive
0,1,False,True,False,True
1,2,False,True,False,True


In [14]:
# Values follow the order of X.columns:
# degree_no, company_facebook, company_google, job_computer programmer, job_sales executive
# -> bachelors (degree_no=1), google, computer programmer.
sample_features = [1, 0, 1, 1, 0]
input_data = pd.DataFrame([sample_features], columns=X.columns)

pred = model.predict(input_data)
print(pred)

[0]


In [16]:

# Values follow the order of X.columns:
# degree_no, company_facebook, company_google, job_computer programmer, job_sales executive
# -> masters (degree_no=2), google, computer programmer.
sample_features = [2, 0, 1, 1, 0]
input_data = pd.DataFrame([sample_features], columns=X.columns)

pred = model.predict(input_data)
print(pred)

[1]
