## MLZoomcamp Week 3 Logistic Regression

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

--2023-09-28 16:21:31--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1475504 (1.4M) [text/plain]
Saving to: 'data.csv'

     0K .......... .......... .......... .......... ..........  3%  233K 6s
    50K .......... .......... .......... .......... ..........  6% 92.5M 3s
   100K .......... .......... .......... .......... .......... 10% 88.7M 2s
   150K .......... .......... .......... .......... .......... 13% 2.67M 1s
   200K .......... .......... .......... .......... .......... 17% 1.33M 1s
   250K .......... .......... .......... .......... .......... 20% 2.19M 1s
   300K .......... .......... .......... .......... .......... 24% 2.80M 1s
   350K ..

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [66]:
df = pd.read_csv('data.csv')

In [67]:
df.columns

Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')

In [68]:
df = df[['Make', 'Model', 'Year', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Vehicle Style',
       'highway MPG', 'city mpg', 'MSRP']]

In [69]:
df.columns = df.columns.str.replace(' ', '_').str.lower()

In [70]:
df = df.fillna(0)

In [71]:
df = df.rename(columns ={'msrp':'price'})

In [72]:
df.columns.isnull()

array([False, False, False, False, False, False, False, False, False,
       False])

#### Question 1

What is the most frequent observation (mode) for the column transmission_type?

    AUTOMATIC
    MANUAL
    AUTOMATED_MANUAL
    DIRECT_DRIVE


In [73]:
df.transmission_type.mode()

0    AUTOMATIC
Name: transmission_type, dtype: object

#### Question 2

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

What are the two features that have the biggest correlation in this dataset?

    engine_hp and year
    engine_hp and engine_cylinders
    highway_mpg and engine_cylinders
    highway_mpg and city_mpg


In [74]:
df_num = df[['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']]
matrix = df_num.corr()

In [75]:
matrix

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.338714,-0.040708,0.25824,0.198171
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0


In [76]:
df.price.mean()

40594.737032063116

In [77]:
df.shape

(11914, 10)

In [78]:
def above_average(value):
    if value > df.price.mean():
        return 1
    else: return 0   

In [79]:
df['above_average'] = df['price'].map(above_average)


In [80]:
df['above_average']

0        1
1        1
2        0
3        0
4        0
        ..
11909    1
11910    1
11911    1
11912    1
11913    0
Name: above_average, Length: 11914, dtype: int64

In [82]:
del df['price']

KeyError: 'price'

In [83]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [84]:
y_full_train = df_full_train['above_average']
y_test = df_test['above_average']
y_train = df_train['above_average']
y_val = df_val['above_average']

In [33]:
X_full_train = df_full_train[['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']]
X_test = df_test[['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']]
X_train = df_train[['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']]
X_val = df_val[['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']]

In [34]:
len(df_train), len(df_val), len(df_test), len(df_full_train)

(7148, 2383, 2383, 9531)

In [35]:
len(y_train), len(y_val), len(y_test), len(y_full_train)

(7148, 2383, 2383, 9531)

#### Question 3

    Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only.
    Round the scores to 2 decimals using round(score, 2).

Which of these variables has the lowest mutual information score?

    make
    model
    transmission_type
    vehicle_style


In [36]:
df.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'above_average'],
      dtype='object')

In [37]:
categorical = ['make', 'model', 'transmission_type', 'vehicle_style']

In [38]:
numerical = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [39]:
def mutual_info_aa_score(series):
    return mutual_info_score(series, y_full_train)

In [40]:
mi = df_full_train[categorical].apply(mutual_info_aa_score)
mi.sort_values(ascending=False)



model                0.460994
make                 0.238724
vehicle_style        0.083390
transmission_type    0.020884
dtype: float64

#### Question 4

    Now let's train a logistic regression.
    Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
    Fit the model on the training dataset.
        To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
        model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

    0.60
    0.72
    0.84
    0.95


In [41]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)



In [42]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [43]:
y_pred = model.predict_proba(X_val)[:,1]
y_pred

array([8.45426579e-04, 9.96341923e-01, 1.49816720e-04, ...,
       2.62411354e-04, 9.89700971e-01, 9.87620006e-01])

In [54]:
a_a = (y_pred >= 0.5).astype(int)
a_a

array([0, 1, 0, ..., 0, 1, 1])

In [45]:
(y_val == a_a).mean()

0.9450272765421738

In [46]:
df_pred = pd.DataFrame()
df_pred['actual'] = y_val
df_pred['probability'] = y_pred
df_pred['prediction'] = a_a.astype(int)
df_pred['correct'] = df_pred.prediction == df_pred.actual

In [47]:
df_pred.correct.mean().round(2)

0.95

1918     False
9951     False
5486     False
292      False
3644     False
         ...  
4385     False
7339     False
9806     False
11162    False
3256     False
Name: above_average, Length: 2383, dtype: bool

#### Question 5

    Let's find the least useful feature using the feature elimination technique.
    Train a model with all these features (using the same parameters as in Q4).
    Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
    For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Which of following feature has the smallest difference?

    year
    engine_hp
    transmission_type
    city_mpg

    Note: the difference doesn't have to be positive


In [86]:
   
df_train = df_train.drop('year', axis=1, inplace = True)
df_val = df_val.drop('year',axis=1, inplace = True)
train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:,1]
a_a = (y_pred >= 0.5)
df_pred = pd.DataFrame()
df_pred['actual'] = y_val
df_pred['probability'] = y_pred
df_pred['prediction'] = a_a.astype(int)
df_pred['correct'] = df_pred.prediction == df_pred.actual
print(df_pred.correct.mean().round(2))



AttributeError: 'NoneType' object has no attribute 'drop'

#### Question 6

    For this question, we'll see how to use a linear regression model from Scikit-Learn.
    We'll need to use the original column price. Apply the logarithmic transformation to this column.
    Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.
    This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].
    Round your RMSE scores to 3 decimal digits.

Which of these alphas leads to the best RMSE on the validation set?

    0
    0.01
    0.1
    1
    10


In [87]:
from sklearn.linear_model import Ridge

In [99]:
df1 = pd.read_csv('data.csv')
df = df1[['Make', 'Model', 'Year', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Vehicle Style',
       'highway MPG', 'city mpg', 'MSRP']]
df.columns = df.columns.str.replace(' ', '_').str.lower()
df = df.fillna(0)
df = df.rename(columns ={'msrp':'price'})

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

y_train = np.log1p(df_train.price.values)
y_val = np.log1p(df_val.price.values)

del df_train['price']
del df_val['price']

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

clf = Ridge(alpha=10, solver = 'sag', random_state = 42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_val)

def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)
rmse(y_val, y_pred)



0.48703228329751275

In [None]:
0
#0.01
#0.1
#1
10

In [92]:
print ('rmse alpha = 1   ' + str(rmse(y_val, y_pred)))

rmse alpha = 1   0.48681817454327286


In [94]:
print ('rmse alpha = 0.1   ' + str(rmse(y_val, y_pred)))

rmse alpha = 0.1   0.4867967000189975


In [96]:
print ('rmse alpha = 0.01   ' + str(rmse(y_val, y_pred)))

rmse alpha = 0.01   0.4867945519275277


In [98]:
print ('rmse alpha = 0   ' + str(rmse(y_val, y_pred)))

rmse alpha = 0   0.48679431324238753


In [100]:
print ('rmse alpha = 10   ' + str(rmse(y_val, y_pred)))

rmse alpha = 10   0.48703228329751275


In [101]:
min( 0.48681817454327286,0.4867967000189975,0.4867945519275277,0.48679431324238753,0.48703228329751275)

0.48679431324238753