# Lab: Linear models in Scikit-Learn

## Linear Regression
Using the diabetes dataset, create a linear regression model and evaluate how well it fits using both mean squared error and mean absolute error.

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.datasets import load_diabetes

# Load scikit-learn's diabetes dataset. Later cells read its `.data` (feature
# matrix), `.target` and `.feature_names` attributes.
data = load_diabetes()

In [2]:
# Fit a linear regression on the complete diabetes data (no hold-out split).
# `fit` returns the estimator itself, so construction and fitting are chained;
# the bare name on the last line displays the fitted estimator.
model = LinearRegression().fit(data.data, data.target)
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [3]:
# In-sample mean absolute error: predictions are scored against the same rows
# the model was fitted on.
diabetes_fitted = model.predict(data.data)
mean_absolute_error(y_true=data.target, y_pred=diabetes_fitted)

43.277395083749866

In [4]:
# In-sample mean squared error, computed on the rows used for fitting.
mean_squared_error(
    y_true=data.target,
    y_pred=model.predict(data.data),
)

2859.6903987680657

Create a Pandas DataFrame with the model coefficients and display them sorted in descending order of absolute value

In [11]:
# Preview the first rows of the feature matrix as a labelled table.
# pandas is imported in this cell because the notebook's other pandas import
# sits in a later cell; without it a fresh Restart & Run All fails here with
# NameError: name 'pd' is not defined.
import pandas as pd

pd.DataFrame(data.data, columns=data.feature_names).head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Refit on standardized features (presumably so that coefficient magnitudes are
# comparable across columns -- confirm intent), keeping the result in `model2`
# so the earlier `model` is left untouched.
model2 = LinearRegression().fit(
    StandardScaler().fit_transform(data.data),
    data.target,
)

# One row per feature: the signed coefficient plus its absolute value, which is
# the sort key for the displayed table.
imp = (
    pd.Series(model2.coef_, index=data.feature_names, name='coef')
    .to_frame()
    .assign(abs_coef=lambda frame: frame['coef'].abs())
)
imp.sort_values(by='abs_coef', ascending=False)

Unnamed: 0,coef,abs_coef
s1,-37.680358,37.680358
s5,35.734713,35.734713
bmi,24.726257,24.726257
s2,22.676487,22.676487
bp,15.429679,15.429679
sex,-11.407031,11.407031
s4,8.422084,8.422084
s3,4.8062,4.8062
s6,3.216612,3.216612
age,-0.476232,0.476232


## Logistic regression

Load the Titanic dataset, develop a logistic regression model for survival, and evaluate its accuracy.

In [12]:
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fetch version 2 of the 'titanic' dataset from OpenML.
# NOTE: this rebinds `data`, which the linear-regression cells above use for the
# diabetes dataset; re-running those cells after this one would pick up the
# Titanic data instead.
data = fetch_openml('titanic', version=2)

In [17]:
# Fit a logistic regression (liblinear solver) on every Titanic row; no data is
# held out at this point. This rebinds `model` from the regression section.
model = LogisticRegression(solver='liblinear').fit(data.data, data.target)
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
# Accuracy on the very rows the model was fitted on, so this is an in-sample
# figure rather than an estimate of performance on new passengers.
ypred = model.predict(data.data)
accuracy_score(y_true=data.target, y_pred=ypred)

0.7760109041344844

In [20]:
from sklearn.model_selection import train_test_split

# Hold out a test split so accuracy is measured on rows the model never saw.
# A fixed random_state makes the shuffle, and therefore the reported scores,
# repeat exactly on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, random_state=0
)
model.fit(X_train, y_train)  # refits `model` in place, replacing the earlier full-data fit
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.7676950998185118

In [21]:
# Caveat: this is accuracy on the training split only, so it says little about
# how the model would perform on unseen, real-world data.
train_pred = model.predict(X_train)
accuracy_score(y_true=y_train, y_pred=train_pred)

0.7787878787878788