In [5]:
#linear regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
class LinearRegression:
    """Simple (single-feature) linear regression fitted by ordinary least squares.

    The fitted model is ``y = b0 + b1 * x``. ``fit`` stores the intercept in
    ``self.b0`` and the slope in ``self.b1``.
    """

    @staticmethod
    def _as_vector(values, name):
        """Return ``values`` as a 1-D float array; an (n, 1) column is flattened.

        Raises:
            ValueError: if ``values`` is neither 1-D nor a single column.
        """
        arr = np.asarray(values, dtype=float)
        if arr.ndim == 2 and arr.shape[1] == 1:
            arr = arr[:, 0]
        if arr.ndim != 1:
            raise ValueError(
                f"{name} must be 1-D or a single column, got shape {arr.shape}"
            )
        return arr

    def fit(self,X,y):
        """Estimate slope and intercept from the data and print them.

        Args:
            X: Feature values, 1-D of length n or a column of shape (n, 1).
            y: Target values, 1-D of length n or a column of shape (n, 1).

        Returns:
            self, so calls can be chained.
        """
        X, y = self._as_vector(X, "X"), self._as_vector(y, "y")
        X_mean, y_mean = np.mean(X), np.mean(y)
        X_mean_diff, y_mean_diff = X-X_mean, y-y_mean
        # Slope = sum((x - x_mean) * (y - y_mean)) / sum((x - x_mean)^2)
        self.b1 = (X_mean_diff @ y_mean_diff) / (X_mean_diff @ X_mean_diff)
        self.b0 = y_mean - (self.b1 * X_mean)
        print(f"(b0,b1):({self.b0:.3f},{self.b1:.3f})")
        return self

    def predict(self,X):
        """Return ``b0 + b1 * X`` element-wise; the result has the shape of ``X``."""
        return self.b0 + X*self.b1

    def evaluate(self,X,y):
        """Print the root mean squared error and R^2 of the model on (X, y).

        Args:
            X: Feature values, 1-D of length n or a column of shape (n, 1).
            y: True target values, 1-D of length n or a column of shape (n, 1).
        """
        X, y = self._as_vector(X, "X"), self._as_vector(y, "y")
        y_diff = y - self.predict(X)
        y_mean_diff = y - np.mean(y)
        ss_res = y_diff @ y_diff            # residual sum of squares
        ss_tot = y_mean_diff @ y_mean_diff  # total sum of squares around the mean
        rmse = np.sqrt(ss_res/X.shape[0])
        r2 = 1 - ss_res/ss_tot
        print("Root mean squared Error:",rmse)
        print("R^2 value:",r2)

In [7]:
def regression_plot(X,y,model,title=""):
    """Scatter the observations and overlay the model's fitted line.

    Args:
        X: Observed feature values (plotted on the x axis).
        y: Observed target values (plotted on the y axis).
        model: Any object with a ``predict`` method accepting a column array.
        title: Optional figure title.
    """
    plt.figure(figsize=(14, 7))
    plt.ylabel("Brain Weights(grams)")
    plt.xlabel("Head Size(cm^3)")
    plt.title(title)

    # The line is drawn between two points 100 units beyond the observed
    # minimum and maximum of X, passed to the model as a 2 x 1 column.
    left_edge = np.min(X) - 100
    right_edge = np.max(X) + 100
    line_x = np.array([left_edge, right_edge]).reshape(-1, 1)
    line_y = model.predict(line_x)

    plt.scatter(X, y, c='orange', label='Original Data Points')
    plt.plot(line_x, line_y, linewidth=4, label='Regression Line')
    plt.legend()

In [18]:
# Load the head-size / brain-weight table and report its dimensions.
# NOTE(review): the path is absolute (filesystem root) while later cells read
# from ./datasets/ — confirm it matches where the CSV lives when re-running.
data = pd.read_csv('/headbrain.csv')
print("size:",data.size,"; shape",data.shape)
# Last expression of the cell: displays the first rows of the frame.
data.head()

size: 948 ; shape (237, 4)


Unnamed: 0,Gender,Age Range,Head Size(cm^3),Brain Weight(grams)
0,1,1,4512,1530
1,1,1,3738,1297
2,1,1,4261,1335
3,1,1,3777,1282
4,1,1,4177,1590


In [None]:
# Feature (head size) and target (brain weight) as 1-D NumPy arrays.
X = data['Head Size(cm^3)'].values
y = data['Brain Weight(grams)'].values

In [None]:
# Fit the from-scratch OLS model, draw its regression line over the data,
# then print RMSE and R^2 computed on the same data it was fitted on.
lin_reg_model= LinearRegression()
lin_reg_model.fit(X,y)
regression_plot(X,y,lin_reg_model,title="Regression Using Ordinary Least Squares Method")
lin_reg_model.evaluate(X,y)

In [None]:
from sklearn.linear_model import LinearRegression as SkLinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# scikit-learn estimators expect X as a 2-D array, so the 1-D X is reshaped
# into a single-column array X1 (one row per sample).
X1 = X.reshape(-1,1)

In [None]:
# Fit scikit-learn's linear regression on the single-column feature array.
sk_lin_reg_model = SkLinearRegression().fit(X1, y)

regression_plot(X1,y,sk_lin_reg_model,title="Linear Regression Using Scikit Learn")

# Same two metrics as LinearRegression.evaluate, computed with sklearn helpers
# on the data the model was fitted on.
y_hat = sk_lin_reg_model.predict(X1)
rmse = np.sqrt(mean_squared_error(y, y_hat))
# score() of a sklearn regressor is the coefficient of determination (R^2).
r2_score = sk_lin_reg_model.score(X1, y)
print("Root Mean Squared Error:",rmse)
print("R^2 value:",r2_score)

## b) Logistic Regression

### Program 1 - Preprocessing and implementing Logistic Regression on the Titanic dataset using Scikit-Learn

#### AIM:
To preprocess the Titanic dataset and implement Logistic Regression on it using Scikit-Learn.

#### Part 1 - Importing modules and Loading the dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Titanic passenger table from the local datasets folder.
titanic_df = pd.read_csv('./datasets/titanic.csv')
# Last expression of the cell: displays the first rows of the frame.
titanic_df.head()

#### Part 2 - Visualizing the dataset

In [None]:
def plot(title,plot_func,*args,**kwargs):
    """Draw one titled figure with an arbitrary plotting callable and show it.

    Args:
        title: Text passed to ``plt.title``.
        plot_func: Callable that does the drawing (e.g. ``sns.heatmap``,
            ``plt.hist``); invoked as ``plot_func(*args, **kwargs)``.
        *args: Positional arguments forwarded unchanged to ``plot_func``.
        **kwargs: Keyword arguments forwarded unchanged to ``plot_func``.
    """
    # The title is set before plot_func runs, i.e. on whatever axes are
    # current at that moment.
    plt.title(title)
    plot_func(*args,**kwargs)
    plt.show()

# Heatmap of the null mask: shows which columns/rows contain missing values.
plot('Visualize Missing Data',
   sns.heatmap,titanic_df.isnull(), cbar=False)
# Counts per 'Survived' value, split by sex and by passenger class.
plot("Survival vs Sex",
    sns.countplot,x='Survived', hue='Sex', data=titanic_df)
plot("Survival vs Passenger Class",
    sns.countplot,x='Survived', hue='Pclass', data=titanic_df)
# Distributions of the numeric columns; missing ages are dropped first.
plot("Histogram of Passengers Age",
    plt.hist,titanic_df["Age"].dropna())
plot("Histogram of ship Fare",
    plt.hist,titanic_df['Fare'])
# Age distribution per passenger class.
plot("Passenger class vs Age",
    sns.boxplot,x='Pclass', y='Age',data=titanic_df)

#### Part 3 - Dealing with missing, categorical and irrelevant data

In [None]:
# Lookup table: passenger class -> mean of the known ages in that class.
mean_ages = {
    p_class: titanic_df.loc[titanic_df["Pclass"] == p_class, "Age"].mean()
    for p_class in titanic_df["Pclass"].unique()
}

def impute_missing_age(columns):
    """Return the age, or the class mean age when the age is missing.

    Args:
        columns: A two-item sequence ``(age, p_class)``.

    Returns:
        ``age`` unchanged when it is not null, otherwise ``mean_ages[p_class]``.
    """
    age, p_class = columns
    return mean_ages[p_class] if pd.isnull(age) else age

# Row-wise (axis=1) imputation: each row's (Age, Pclass) pair is passed to
# impute_missing_age and the result overwrites the Age column.
titanic_df['Age'] = titanic_df[['Age', 'Pclass']].apply(
    impute_missing_age,axis = 1
)
# Re-draw the null-mask heatmap after the Age column has been imputed.
plot("Missing Passengers Age Data",
    sns.heatmap,titanic_df.isnull(), cbar=False)

In [None]:
# Remove the Cabin column, then discard every row that still has a missing value.
titanic_df = titanic_df.drop(columns='Cabin').dropna()

# One-hot encode the two categorical columns, dropping the first level of each.
sex_data, embarked_data = (
    pd.get_dummies(titanic_df[column], drop_first=True)
    for column in ('Sex', 'Embarked')
)

# Append the dummy columns and drop the identifier / raw categorical columns.
titanic_df = pd.concat([titanic_df, sex_data, embarked_data], axis=1).drop(
    columns=['Name', 'PassengerId', 'Ticket', 'Sex', 'Embarked']
)
titanic_df.head()

#### Part 4 - Implementing Logistic Regression using Scikit Learn

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import minmax_scale

# Rescale Age and Fare to the [0, 1] range (minmax_scale's default range).
# NOTE(review): scaling happens before the train/test split, so the scaler
# sees the test rows too — confirm that is acceptable for this exercise.
titanic_df[["Age","Fare"]] = minmax_scale(titanic_df[["Age","Fare"]])

X = titanic_df.drop('Survived', axis = 1)
y = titanic_df['Survived']
# Fixed seed so the 70/30 split, and therefore the report printed below,
# is the same on every run.
X_train, X_test, y_train, y_test =(
    train_test_split(X, y, test_size = 0.3, random_state = 6)
)
model = LogisticRegression()
model.fit(X_train, y_train)
y_hat = model.predict(X_test)

def print_title(title):
    """Print ``title`` centred in a 50-character field, underlined by 50 '='."""
    heading = format(title, '^50')
    rule = '=' * 50
    print(heading + "\n" + rule)
    
# Per-class precision/recall/F1 on the held-out test split.
print_title("Classification Report")
print(classification_report(y_test, y_hat))
# Confusion matrix of true labels (y_test) against predictions (y_hat).
print_title("Confusion Matrix")
print(confusion_matrix(y_test, y_hat))

#### Part 1 - Defining Class for Logistic Regression

In [None]:
import numpy as np
import pandas as pd

In [None]:

class LogitRegression:
    """Binary logistic regression trained with batch gradient descent.

    After ``fit`` the weights are stored in ``self.w``; ``self.w[0]`` is the
    bias term and the remaining entries are the per-feature coefficients.
    """

    def __init__(self, learning_rate, iterations):
        """Store the gradient-descent step size and the number of steps."""
        self.learning_rate = learning_rate
        self.iterations = iterations

    def p(self, X):
        """Return sigmoid(X @ w) for each row of ``X``.

        ``X`` must already include the leading column of ones for the bias.
        """
        return 1/(1+np.exp(-(X @ self.w)))

    def fit(self, X, y):
        """Learn the weights from the training data.

        Args:
            X: 2-D array of shape (m, n) with one row per sample.
            y: Array of m labels; squeezed to 1-D before use.

        Returns:
            self, so calls can be chained.
        """
        m, n = X.shape
        # Prepend a column of ones so the bias is learned as w[0].
        X = np.hstack([np.ones((m,1)),X])
        y = y.squeeze()
        self.w = np.zeros(n+1)

        for _ in range(self.iterations):
            # (p - y) @ X is the gradient of the log-loss summed over all
            # samples (not averaged).
            self.w = self.w - self.learning_rate * ((self.p(X)-y) @ X)
        return self

    def predict(self, X):
        """Return 0/1 labels for each row of ``X`` (1 when probability > 0.5)."""
        m = X.shape[0]
        X = np.hstack([np.ones((m,1)),X])
        y_hat = np.where( self.p(X) > 0.5, 1, 0 )
        return y_hat

#### Part 2 - Loading and Processing Dataset

In [None]:
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:

# Read the diabetes table: every column except the last is used as a feature
# (min-max scaled), the last column is used as the label vector.
diabetes_df = pd.read_csv("./datasets/diabetes.csv")
X = minmax_scale(diabetes_df.iloc[:, :-1].values)
y = diabetes_df.iloc[:, -1].values

# Hold out one third of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=6, test_size=1/3
)

In [None]:

# models[0] is the from-scratch implementation, models[1] is scikit-learn's;
# both are fitted on the same training split.
models = [
    LogitRegression(learning_rate = .1, iterations = 1000),
    LogisticRegression()
]
for model in models:
    model.fit(X_train,y_train)
    
def compute_accuracy(model,X_test,y_test):
    """Return the percentage of test samples the model labels correctly.

    Args:
        model: Object with a ``predict`` method.
        X_test: Features passed to ``model.predict``.
        y_test: True labels, compared element-wise with the predictions.
    """
    matches = model.predict(X_test) == y_test
    return 100 * matches.mean()

# Test-set accuracy (in percent) of the from-scratch model, then of sklearn's.
print("Accuracy on test set by our implementation of Logistic Reg model :",
    compute_accuracy(models[0],X_test,y_test)
)
print("Accuracy on test set by sklearn model :",
    compute_accuracy(models[1],X_test,y_test) 
)