In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('mtcars.csv')
df.head()

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [3]:
X = df.hp.values.reshape(-1,1)
y = df.mpg.values.reshape(-1,1)
X.shape, y.shape

((32, 1), (32, 1))

In [4]:
from sklearn.linear_model import LinearRegression

m = LinearRegression().fit(X,y)
m.intercept_, m.coef_

(array([30.09886054]), array([[-0.06822828]]))

In [8]:
def gr_desc(x, y, learn_rate, conv_threshold, batch_size, max_iter):
    converged = False
    iter = 0
    m = batch_size
    t0 = np.random.random(x.shape[1])
    t1 = np.random.random(x.shape[1])
    MSE = (sum([(t0 + t1*x[i] - y[i])**2 for i in range(m)])/ m)
    while not converged:
        grad0 = 1.0/m * sum([(t0 + t1*x[i] - y[i]) for i in range(m)])
        grad1 = 1.0/m * sum([(t0 + t1*x[i] - y[i])*x[i] for i in range(m)])
        temp0 = t0 - learn_rate * grad0
        temp1 = t1 - learn_rate * grad1
        t0 = temp0
        t1 = temp1
        MSE_New = (sum( [ (t0 + t1*x[i] - y[i])**2 for i in range(m)]) / m)
        if abs(MSE - MSE_New ) <= conv_threshold:
            print('Converged, iterations: ', iter)
            converged = True
        MSE = MSE_New
        iter += 1
        if iter == max_iter:
            print('Max interactions reached')
            converged = True
    return t0,t1

In [11]:
def gradient_descent(x, y,learn_rate, conv_threshold,batch_size,max_iter):    
    converged = False
    iter = 0
    m = batch_size 
 
    t0 = np.random.random(x.shape[1])
    t1 = np.random.random(x.shape[1])

    MSE = (sum([(t0 + t1*x[i] - y[i])**2 for i in range(m)])/ m)    

    while not converged:        
        grad0 = 1.0/m * sum([(t0 + t1*x[i] - y[i]) for i in range(m)]) 
        grad1 = 1.0/m * sum([(t0 + t1*x[i] - y[i])*x[i] for i in range(m)])

        temp0 = t0 - learn_rate * grad0
        temp1 = t1 - learn_rate * grad1
    
        t0 = temp0
        t1 = temp1

        MSE_New = (sum( [ (t0 + t1*x[i] - y[i])**2 for i in range(m)] ) / m)

        if abs(MSE - MSE_New ) <= conv_threshold:
            print ('Converged, iterations: ', iter)
            converged = True
    
        MSE = MSE_New   
        iter += 1 
    
        if iter == max_iter:
            print ('Max interactions reached')
            converged = True

    return t0,t1

In [None]:
interc, coef = gradient_descent(x = X,y = y,learn_rate=0.01 ,conv_threshold=1e-5, batch_size=32,max_iter=1500000)
interc, coef

In [None]:
intrercept, coeff = gr_desc(X,y,0.01,1e-5,32,150000)
intrercept, coeff

In [21]:
from sklearn.model_selection import train_test_split              
                        
original_data = pd.read_csv("mtcars.csv")                   
 

def data_split(dat,trf = 0.5,vlf=0.25,tsf = 0.25):
    nrows = dat.shape[0]    
    trnr = int(nrows*trf)
    vlnr = int(nrows*vlf)    
    
    tr_data,rmng = train_test_split(dat,train_size = trnr,random_state=42)
    vl_data, ts_data = train_test_split(rmng,train_size = vlnr,random_state=45)  
    
    return (tr_data,vl_data,ts_data)


train_data, validation_data, test_data = data_split(original_data,trf=0.5,vlf=0.25,tsf=0.25)

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.pipeline import Pipeline



input_data = pd.read_csv("ad.csv",header=None)                       

X_columns = set(input_data.columns.values)
y = input_data[len(input_data.columns.values)-1]
X_columns.remove(len(input_data.columns.values)-1)
X = input_data[list(X_columns)]


X_train, X_test,y_train,y_test = train_test_split(X,y,train_size = 0.7,random_state=33)

pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy'))
])
parameters = {
    'clf__max_depth': (50,100,150),
    'clf__min_samples_split': (2, 3),
    'clf__min_samples_leaf': (1, 2, 3)
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)

y_pred = grid_search.predict(X_test)

print ('\n Best score: \n', grid_search.best_score_)
print ('\n Best parameters set: \n')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ('\t%s: %r' % (param_name, best_parameters[param_name]))
print ("\n Confusion Matrix on Test data \n",confusion_matrix(y_test,y_pred))
print ("\n Test Accuracy \n",accuracy_score(y_test,y_pred))
print ("\nPrecision Recall f1 table \n",classification_report(y_test, y_pred))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   16.4s finished



 Best score: 
 0.9673202614379085

 Best parameters set: 

	clf__max_depth: 50
	clf__min_samples_leaf: 1
	clf__min_samples_split: 2

 Confusion Matrix on Test data 
 [[814  19]
 [ 17 134]]

 Test Accuracy 
 0.9634146341463414

Precision Recall f1 table 
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       833
           1       0.88      0.89      0.88       151

    accuracy                           0.96       984
   macro avg       0.93      0.93      0.93       984
weighted avg       0.96      0.96      0.96       984

