In [None]:
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder,PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error,r2_score,make_scorer
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


In [9]:
data=pd.read_csv('beer-servings.csv')
X=data.drop(columns=['total_litres_of_pure_alcohol'])
y=data['total_litres_of_pure_alcohol']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
y_test=y_test.fillna(y_test.mean())

In [5]:
num_features=X.select_dtypes(include='number')
cat_features=X.select_dtypes(include='object')
cat_features
num_features

Unnamed: 0.1,Unnamed: 0,beer_servings,spirit_servings,wine_servings
0,0,0.0,0.0,0.0
1,1,89.0,132.0,54.0
2,2,25.0,0.0,14.0
3,3,245.0,138.0,312.0
4,4,217.0,57.0,45.0
...,...,...,...,...
188,188,,100.0,3.0
189,189,111.0,2.0,1.0
190,190,6.0,0.0,0.0
191,191,32.0,19.0,4.0


In [6]:
num_col=num_features.columns.tolist()
def outlier_removal(df,num_col):
    for i in num_col:
        q1=df[i].quantile(0.25)
        q3=df[i].quantile(0.75)
        iqr=q3-q1
        lb=q1-(1.5*iqr)
        ub=q3+(1.5*iqr)
        df[i]=df[i].clip(upper=ub)
        df[i]=df[i].clip(lower=lb)
    return df[num_col]
cat_col=cat_features.columns.tolist()

In [19]:
num=Pipeline(steps=[
    ('imp',SimpleImputer(strategy='mean')),
    ('scale',StandardScaler()),
    ('poly',PolynomialFeatures(degree=2,include_bias=False))
])

cat=Pipeline(steps=[
    ('imp',SimpleImputer(strategy='most_frequent')),
    ('enc',OneHotEncoder(handle_unknown='ignore'))
])

prep =ColumnTransformer(transformers=[
    ('num',num,num_col),
    ('cat',cat,cat_col)
])
model=Pipeline(steps=[
    ('pre',prep),
    ('mod',LinearRegression())
])



In [20]:
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
pd.DataFrame(y_pred)
X_train
print(r2_score(y_pred,y_test))
print(mean_squared_error(y_pred,y_test))
model.get_params()


0.8674189118811056
1.7898325779744386


{'memory': None,
 'steps': [('pre', ColumnTransformer(transformers=[('num',
                                    Pipeline(steps=[('imp', SimpleImputer()),
                                                    ('scale', StandardScaler()),
                                                    ('poly',
                                                     PolynomialFeatures(include_bias=False))]),
                                    ['Unnamed: 0', 'beer_servings',
                                     'spirit_servings', 'wine_servings']),
                                   ('cat',
                                    Pipeline(steps=[('imp',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('enc',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    ['country', 'continent'])])),
  ('mod', LinearRegression())],
 'transform_i

In [21]:
from sklearn.model_selection import GridSearchCV

grid= GridSearchCV(
    estimator=model,
    param_grid={},
    scoring={'mse':make_scorer(mean_squared_error),'r2':make_scorer(r2_score)},
    refit='r2',
    cv=10,
    n_jobs=-1
)

In [22]:
grid.fit(X_train,y_train)

0,1,2
,estimator,Pipeline(step...egression())])
,param_grid,{}
,scoring,"{'mse': make_scorer(m...hod='predict'), 'r2': make_scorer(r...hod='predict')}"
,n_jobs,-1
,refit,'r2'
,cv,10
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [23]:
y_pred=grid.predict(X_test)
print(r2_score(y_pred,y_test))
print(mean_squared_error(y_pred,y_test))


0.8674189118811056
1.7898325779744386
