## Decision Tree Classification

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the salary data set and preview the first rows.
df = pd.read_csv("salaries.csv")
df.head()

# Split the frame into predictors (everything except the response column)
# and the response itself.
inputs = df.drop('salary_more_then_100k', axis='columns')
target = df['salary_more_then_100k']

# 'company', 'job' and 'degree' hold text labels, so each one is converted to
# integer codes. One LabelEncoder per column: every encoder then keeps the
# classes_ of its own column and can be reused later to transform or
# inverse_transform values for that column.
le_company = LabelEncoder()
le_job = LabelEncoder()
le_degree = LabelEncoder()

# Add the encoded columns next to the original ones. Each column must be
# fitted with its own encoder; re-fitting le_company for all three would
# leave le_job / le_degree unfitted and overwrite le_company's classes.
inputs['company_n'] = le_company.fit_transform(inputs['company'])
inputs['job_n'] = le_job.fit_transform(inputs['job'])
inputs['degree_n'] = le_degree.fit_transform(inputs['degree'])

# Keep only the numeric columns for modelling.
inputs_n = inputs.drop(['company', 'job', 'degree'], axis='columns')


#### Decision tree library

In [None]:
from sklearn import tree

# random_state is fixed so that re-running the notebook builds the same tree.
model = tree.DecisionTreeClassifier(random_state=42)

# The model is trained on the full data set here; a more honest evaluation
# would divide the data into a train set and a test set first.
model.fit(inputs_n, target)

# Accuracy measured on the same rows the model was trained on.
model.score(inputs_n, target)

# Predict for one new observation. The values follow the column order of
# inputs_n; wrapping them in a DataFrame with the same column names keeps the
# input consistent with what the model was fitted on and avoids scikit-learn's
# "X does not have valid feature names" warning.
sample_row = pd.DataFrame([[2, 1, 0]], columns=inputs_n.columns)
model.predict(sample_row)

## Support Vector Machine

A Support Vector Machine (SVM) is a supervised learning algorithm; in this section it is used for classification.

In [None]:
from sklearn.datasets import load_iris

# Load the iris data set; without this `iris` is undefined on a fresh kernel.
iris = load_iris()
iris.feature_names  # column names of the iris dataset

# Build a DataFrame from iris.data, using the iris feature names as columns.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head()

# Attach the class labels; every filter below reads df.target, so the column
# has to exist before they run.
df['target'] = iris.target

# Rows for which target is 1.
df[df.target == 1].head()

# Derive a readable flower name from the numeric target.
df['flower_name'] = df.target.apply(lambda x: iris.target_names[x])
df.head()

# One DataFrame per target value.
df0 = df[df.target == 0]
df1 = df[df.target == 1]
df2 = df[df.target == 2]

# data visualization
import matplotlib.pyplot as plt
%matplotlib inline

# creating scatter plots
Sepal length vs Sepal Width (Setosa vs Versicolor)

plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.scatter(df0['sepal length (cm)'], df0['sepal width (cm)'],color="green",marker='+')
plt.scatter(df1['sepal length (cm)'], df1['sepal width (cm)'],color="blue",marker='.')

Petal length vs Pepal Width (Setosa vs Versicolor)
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')
plt.scatter(df0['petal length (cm)'], df0['petal width (cm)'],color="green",marker='+')
plt.scatter(df1['petal length (cm)'], df1['petal width (cm)'],color="blue",marker='.')

####  Train Using Support Vector Machine (SVM)

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the label and its readable name.
X = df.drop(['target', 'flower_name'], axis='columns')
y = df.target

# Hold out 20% of the rows for testing. random_state is fixed so the same
# rows are selected on every run and the scores below are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show both sizes together; a bare len(...) in the middle of a cell is
# evaluated but never displayed.
len(X_train), len(X_test)


# importing SVM library
from sklearn.svm import SVC

# Fit an SVC with its default settings and measure accuracy on the test set.
model = SVC()
model.fit(X_train, y_train)
model.score(X_test, y_test)

# Predict for one new measurement. The four values follow the column order
# of X_train; passing them as a DataFrame with the same column names matches
# what the model was fitted on and avoids scikit-learn's
# "X does not have valid feature names" warning.
new_flower = pd.DataFrame([[4.8, 3.0, 1.5, 0.3]], columns=X_train.columns)
model.predict(new_flower)

#### Tune parameters

##### 1. Regularization (C)

In [None]:
# Regularization parameter C = 1: fit on the training split, then report
# accuracy on the test split.
model_C = SVC(C=1).fit(X_train, y_train)
model_C.score(X_test, y_test)

# Same procedure with C = 10; model_C is rebound to the new model.
model_C = SVC(C=10).fit(X_train, y_train)
model_C.score(X_test, y_test)

##### 2. Gamma

In [None]:
# Fit an SVC with gamma = 10 on the training split and report its accuracy
# on the test split.
model_g = SVC(gamma=10).fit(X_train, y_train)
model_g.score(X_test, y_test)

##### 3. Kernel

In [None]:
# Fit an SVC that uses a linear kernel on the training split, then report
# its accuracy on the test split.
model_linear_kernal = SVC(kernel='linear').fit(X_train, y_train)
model_linear_kernal.score(X_test, y_test)

## Random Forest 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the rows for testing. Both the split and the forest are
# seeded so the score and the predictions are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# A forest of 20 trees.
model = RandomForestClassifier(n_estimators=20, random_state=42)
model.fit(X_train, y_train)

# Accuracy on the held-out rows, then the per-row predictions that the
# confusion matrix is built from.
model.score(X_test, y_test)
y_predicted = model.predict(X_test)

#confusion matrix

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_predicted)
cm
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sn
plt.figure(figsize=(10,7))
sn.heatmap(cm, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Truth')