In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('beer.csv')  

# Removing the full names of the beers, Name and Brewery since it won't influence the model
# Removing Description since it is just plain text.
df = df.drop( ['Beer Name (Full)', 'Description','Name','Brewery'], axis = 1)

In [3]:
df.corr()

Unnamed: 0,ABV,Min IBU,Max IBU,Astringency,Body,Alcohol,Bitter,Sweet,Sour,Salty,Fruits,Hoppy,Spices,Malty,review_aroma,review_appearance,review_palate,review_taste,review_overall,number_of_reviews
ABV,1.0,0.432005,0.501037,-0.169521,0.241673,0.654908,0.067388,0.463487,0.100795,-0.120089,0.291001,-0.052596,0.191468,0.16206,0.416197,0.349139,0.392517,0.3812,0.251838,0.142176
Min IBU,0.432005,1.0,0.854248,-0.071501,0.325338,0.323694,0.539452,0.227139,-0.073098,-0.057512,0.066335,0.407475,-0.046152,0.300041,0.34373,0.380908,0.352723,0.339885,0.289972,0.219822
Max IBU,0.501037,0.854248,1.0,-0.120273,0.310617,0.392811,0.47808,0.277292,-0.043275,-0.083214,0.172929,0.345168,0.044533,0.288219,0.369456,0.379655,0.355608,0.346557,0.271969,0.221572
Astringency,-0.169521,-0.071501,-0.120273,1.0,-0.05954,-0.171987,0.114686,-0.021456,0.57103,0.347155,0.345232,0.330951,-0.083795,-0.082085,0.068579,0.070948,0.10695,0.092021,0.159788,-0.050062
Body,0.241673,0.325338,0.310617,-0.05954,1.0,0.268885,0.542236,0.458842,-0.126733,-0.099277,-0.048155,0.070138,0.185123,0.754228,0.360234,0.421331,0.34527,0.365055,0.313824,0.039057
Alcohol,0.654908,0.323694,0.392811,-0.171987,0.268885,1.0,0.009088,0.527039,0.048767,-0.094329,0.254299,-0.079949,0.252876,0.270106,0.242989,0.184693,0.205569,0.206701,0.069789,0.048346
Bitter,0.067388,0.539452,0.47808,0.114686,0.542236,0.009088,1.0,0.091705,-0.136914,0.004693,-0.09345,0.712887,-0.084048,0.56557,0.241457,0.331854,0.257267,0.253373,0.260874,0.112026
Sweet,0.463487,0.227139,0.277292,-0.021456,0.458842,0.527039,0.091705,1.0,0.257913,-0.131918,0.48203,-0.034327,0.107548,0.471032,0.356514,0.332311,0.315767,0.322277,0.207293,0.048487
Sour,0.100795,-0.073098,-0.043275,0.57103,-0.126733,0.048767,-0.136914,0.257913,1.0,0.098173,0.785883,0.068895,0.001831,-0.303266,0.28932,0.204817,0.245169,0.244314,0.213449,0.006642
Salty,-0.120089,-0.057512,-0.083214,0.347155,-0.099277,-0.094329,0.004693,-0.131918,0.098173,1.0,0.02692,0.172606,-0.023079,-0.028241,-0.08821,-0.081388,-0.038413,-0.057479,-0.008199,-0.02968


## From the correlation table we can clearly observe a high correlation between review_overall and other review_ related features which indicates the need to remove those columns because of : 
###### 1. No sense to add them to the model because review_overall is  a characterization/ mean value of all review_ features. 
###### 2. Lack of multicollinearity since the  independent variables are highly correlated with each other and with the target column.

In [4]:
#Dropping the review_ columns  
df = df.drop( ['review_aroma', 'review_appearance','review_palate','review_taste'], axis = 1)

In [5]:
# Initializing dummies variables for Style feature (111 Categorical data in total)
df = pd.get_dummies(df, columns=['Style'])

In [6]:
# Getting a copy of series of target column
temp_target = df['review_overall']   

# Removing the target column at the present position
df.pop('review_overall')  

# Adding back as the last column the column target
df.insert(df.shape[1], 'review_overall', temp_target)

In [7]:
# Parsing the correlation table and selecting the features with highest correlations
for i in range (20):
    for j in range(i):
        if abs ( df.corr().values[i][j] ) > 0.6:
            print(f'{df.columns[i]} and {df.columns[j]} : {df.corr().values[i][j]} ')

Max IBU and Min IBU : 0.854248060050787 
Alcohol and ABV : 0.6549081323635012 
Fruits and Sour : 0.7858825416364527 
Hoppy and Bitter : 0.7128867533688273 
Malty and Body : 0.7542281845837208 


##### 1 . The IBU  indices are expected to have the same correlation since they are describing the same 'feature' of the beer. Also, that depends on the type of beer since a beer can have the minimum IBU as another's maximum IBU.   
##### 2. ABV, or alcohol by volume, is the standard measurement, used worldwide, to assess the strength of a particular beer.  So, basically is the same as Alcohol.
#####  3. A strong and strange correlation between Fruits and Sour  is observed. That is actually because, essentially, "sour beer" refers to any beer that tastes especially acidic and lively. By including fruits like raspberry, cherry and peach, sour beers can create the perfect balance of sweet and sour flavors.   
##### 4.   So hoppy just means you can taste and/or smell the characteristics of the hops which can be fruity, earthy, citric, floral, piney, etc based on the type of hop. It does not always mean that the beer is bitter. Eventually, the data sample is composed of bitter beers :) .

##### 5. Malty and Body  is all about Dextrin malt. Dextrin malt is made from malted barley and is a type of crystal malt. It contributes BODY to beer, aids in foam retention and beer stability, and gives the beer additional smoothness and sometimes a sense of sweetness.  

## Getting the > 0.2 correlation sample.

In [8]:
#initializing df's index array with highest correlations
index_cor_arr = list()

# looping untill target column's index since we do not want to include the dummy vars
for i in range (df.shape[1]):
    # each time assigning   val with  column's correlated value to the review_overall's index 
    val =   df.corr().values[ df.shape[1] - 1 ][i]  
    val = abs(val) 
    #comparing the val and storing the column's indexes in a list
    if val > 0.2:
        index_cor_arr.append(i)     

In [9]:
index_cor_arr

[0, 1, 2, 4, 6, 7, 8, 10, 13, 52, 58, 63, 64, 74, 126]

In [10]:
#Initializing a dataframe with correlated values ( corr coef is > 0.2 )  
df2 = pd.DataFrame({  
    
    # The column's names are taken from df and they are being assigned the values from df too, all in a loop.  
    f'{df.columns[ index_cor_arr[i] ] }' :  df[f'{ df.columns[ index_cor_arr[i] ] }'] for i in range ( len(index_cor_arr)  ) 
    
            })

In [11]:
df2.corr()

Unnamed: 0,ABV,Min IBU,Max IBU,Body,Bitter,Sweet,Sour,Fruits,Malty,Style_Lager - Adjunct,Style_Lager - European Strong,Style_Lager - Light,Style_Lager - Malt Liquor,Style_Low Alcohol Beer,review_overall
ABV,1.0,0.432005,0.501037,0.241673,0.067388,0.463487,0.100795,0.291001,0.16206,-0.079691,0.088238,-0.113295,0.047758,-0.239565,0.251838
Min IBU,0.432005,1.0,0.854248,0.325338,0.539452,0.227139,-0.073098,0.066335,0.300041,-0.118946,-0.051875,-0.112055,-0.086239,-0.084909,0.289972
Max IBU,0.501037,0.854248,1.0,0.310617,0.47808,0.277292,-0.043275,0.172929,0.288219,-0.117442,0.005273,-0.142268,-0.042984,-0.089414,0.271969
Body,0.241673,0.325338,0.310617,1.0,0.542236,0.458842,-0.126733,-0.048155,0.754228,-0.107445,-0.083148,-0.106101,-0.068026,-0.096184,0.313824
Bitter,0.067388,0.539452,0.47808,0.542236,1.0,0.091705,-0.136914,-0.09345,0.56557,-0.091682,-0.065881,-0.088128,-0.084251,-0.054305,0.260874
Sweet,0.463487,0.227139,0.277292,0.458842,0.091705,1.0,0.257913,0.48203,0.471032,-0.128066,-0.020913,-0.128473,-0.045952,-0.101637,0.207293
Sour,0.100795,-0.073098,-0.043275,-0.126733,-0.136914,0.257913,1.0,0.785883,-0.303266,-0.082426,-0.047525,-0.065038,-0.050849,-0.065582,0.213449
Fruits,0.291001,0.066335,0.172929,-0.048155,-0.09345,0.48203,0.785883,1.0,-0.19689,-0.114034,-0.065621,-0.087335,-0.074895,-0.08728,0.261504
Malty,0.16206,0.300041,0.288219,0.754228,0.56557,0.471032,-0.303266,-0.19689,1.0,-0.104131,-0.074188,-0.099312,-0.043736,-0.077612,0.21299
Style_Lager - Adjunct,-0.079691,-0.118946,-0.117442,-0.107445,-0.091682,-0.128066,-0.082426,-0.114034,-0.104131,1.0,-0.013278,-0.013449,-0.012203,-0.012014,-0.200449


##### American Adjunct lagers the goal isn't fewer calories but a certain “crisp” flavor profile  ( Bitter flavour =. IBU indexes ) 
#####  This Euro Lager type is typically higher in alcohol than a brewery's standard Lager. ( ABV feature )   
##### Malt liquor is a strong lager or ale in which sugar, corn or other adjuncts are added to the malted barley to boost the total amount of fermentable sugars in the wort.  ==> Basically describes the correlation between Fruits and Sour features.  

# Building the models

In [12]:
# functions that takes a dataset as input and trains a model on LR
def lr_score(df):
    """" 
    X : independent vars 
    y : dependent vars   
    
    """ 
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values 

    # Splitting the data in Training set and Testing's
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) 

    # Fitting the data in the LR model
    from sklearn.linear_model import LinearRegression
    lr = LinearRegression() 
    lr.fit(X_train, y_train) 

    # Predicting 
    y_pred = lr.predict(X_test)   
    
    # Printing the score
    print(lr.score(X,y)) 
    
    
    from sklearn.metrics import mean_squared_error, mean_absolute_error  
    print(f' Mean Squared err = {mean_squared_error(y_pred, y_test)} \n Mean Abs err = {mean_absolute_error(y_pred, y_test)}')
    

In [13]:
# just the sample with small changes.
lr_score(df) 

0.5392677259249217
 Mean Squared err = 0.1095537721503416 
 Mean Abs err = 0.2454847649246335


In [14]:
# Correlated features
lr_score(df2)

0.4318360971670949
 Mean Squared err = 0.12314799567027009 
 Mean Abs err = 0.2622232910731677


## Conclusion
####  A observation is  that the model has been trained better on non correlated data ( since the features would not be "similiar", we got better results.  
#### I observed a high correlation between features of 2nd sample  ( + corr coef of 0.2 - 0.7 with the target column ) and this might  have worsened the model

In [15]:
#
#
#
#
#
# Some errors de aici, cred ca la meeting.

In [70]:
class Lin_reg(): 
    #Define a Linear Regression class to store out relevant fucntions in
    
    def __init__(self):
        """"
            Initializes the Linear Regression model.
            Also stores the self.params__ variable, which
            will be the weights that the model returns. 
        """ 
        self.params__ = None  
        
    def gradientDescent(self, X, y, learning_rate = 0.00001,
                       iterations = 500, batch_size = 16):
        """"
            This function applies the Gradient Descent model
            onto the dataset
        :param X: numpy.ndarray
            The X matrix containing the independet variable columns.
        :param y: numpy.ndarray
            The target vector y
        """ 
        #Adds a columns of ones for the constant term
        X = np.concatenate([X,np.ones_like(y)], axis = 1)
        rows, cols = X.shape 
        
        #Combine the X and y columns to more easily shuffle it later
        X = np.append(X,y, axis = 1)
        
        #Make the initial random guess for w 
        w = np.random.random((cols,1)) 
        
        #Go through all the iterations
        for i in range( iterantions ):
            #Shuffle the rows of the data
            np.random.shuffle(X)
            #Defines X and y again
            y_it = X[:,-1].reshape((rows,1))
            X_it = X[:,:-1] 
            
            for batch in range(math.ceil(rows / batch_size)):
                batch_start = batch * batch_size
                #Cut a batch from the dataset
                x_batch = X_it[ batch_start : min(batch_start +
                                                batch_size, X.shape[0])] 
                y_batch = y_it[ batch_start : min(batch_start +
                                                 batch_size, X.shape[0])]
                
                #Substract the gradient from our previous estimation
                
                w -= learning_rate * np.matmul(x_batch.transpose(), 
                                              (np.matmul(x_batch, w) - y_batch))
        self.params__ = w 
        return self  
    
    def predict(self,X):
        X = np.concatenate([X, np.ones(X.shape[0]).reshape((X.shape[0], 1))],
                          axis = 1)
        return np.matmul(X, self.params__)
        
        

In [74]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values   

# Te intreb deja la meeting
kek = Lin_reg
kek.gradientDescent(X,y)

TypeError: gradientDescent() missing 1 required positional argument: 'y'