## Linear Regression with Catagorical Features

In [2]:
import numpy as np
import pandas as pd
import math
import random
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
% matplotlib inline

In [3]:
class LinearRegression:
        
    def hyp(self):
        return np.dot(self.X, self.theta)
        
    def get_cost(self):
        return sum((self.hyp() - self.y)**2) / (self.m * 2.0)
    
    def fscale(self):
        self.X= self.X / (self.X.max(axis= 0) - self.X.min(axis= 0))
        self.y= self.y / (max(self.y) - min(self.y))
        
    def computeStandard(self):
        from sklearn.linear_model import LinearRegression
        self.linreg= LinearRegression()
        self.linreg.fit(self.X, self.y)

    def makedummy(self, X, col):
        dumm= pd.get_dummies(X[col], prefix=col)
        X= pd.concat([X, dumm], axis=1)
        X= X.drop(col, 1)
        return X
        
        
    def preprocess(self, X, y, alpha):
        (self.m, self.n)= X.shape
        self.columns= X.columns
        self.X= X.values
        self.y= y.values
        self.fscale()
        self.computeStandard()
        self.n+=1
        self.X= np.insert(self.X, 0, 1, axis= 1)
        self.y= self.y[None].transpose()
        self.alpha= alpha
        self.theta= (np.ones(self.n)*10)[None].transpose()
        
    def plot(self):
        plt.title("Contirbution of each feature in the output")
        colors = ['yellowgreen', 'violet',  'gold', 'lightskyblue', 'red', 'green', 'blue', 'lightcoral']
        plt.pie(abs(self.theta), labels=self.columns.insert(0, 'Base'), autopct='%1.1f%%', colors= colors)
        for i in range(1, self.n):
            f= plt.figure()
            ax = f.add_subplot(111)
            yd= (max(self.X[:,i] * self.theta[i]) - min(self.X[:,i] * self.theta[i]))
            yy= self.y* yd
            offset= yd * self.theta[0]
            ax.scatter(self.X[:,i], yy)
            ax.plot([0,1.0], [offset, self.theta[i]+offset], 'g')
            plt.ylabel('Effect on price')
            plt.xlabel('{}'.format(self.columns[i-1]))
        
    def fit(self, X, y, alpha= 0.1, cat=None):
        if cat is not None:
            X= self.makedummy(X, cat)
            #print(X.sample(10))
            #print(X.columns)
        self.preprocess(X, y, alpha)
        pc= math.inf
        change= 1
        iteration=0
        while(change>0.000000001):
            der = (self.hyp() - self.y).transpose()
            der = np.dot(der , self.X) * self.alpha / self.m
            self.theta= self.theta - der.transpose()

            self.cost= self.get_cost()
            change= pc - self.cost
            pc= self.cost
            iteration += 1
            if(iteration % 500 == 0):
                print("{}. Cost: {}, theta: {}".format(iteration, self.cost, self.theta.transpose()))
        print("Optimization finished in {} iterations.".format(iteration))
        print('+'*70, "\nFinal Cost: {}, theta: {}".format(self.cost, self.theta.transpose()))
        print("sklearn: theta: ",self.linreg.intercept_, self.linreg.coef_,"\n")
        
            
    def predict(self, X):
        X= X.values
        X= X / (X.max(axis= 0) - X.min(axis= 0))
        X= np.insert(X, 0, 1, axis= 1)
        return np.dot(X, self.theta)
    
    def mse(self, y_true, y_pred):
        y_true= y_true.values
        y_true= y_true / (max(y_true) - min(y_true))
        y_true= y_true[None].transpose()
        return sum((y_true - y_pred)**2) / y_true.shape[0]
    
    def rmsq(self, y_true, y_pred):
        return np.sqrt(self.mse(y_true, y_pred))

### Soil Content Dataset

* water
* sand
* silt
* clay
* gases
* geoarea
* organic_matter

In [4]:
m = 1800
np.random.seed(42)

water= np.random.randint(low= 0, high= 10, size= m)
sand= np.random.randint(low= 100, high= 1300, size= m)
silt= np.random.randint(low= 10, high= 25, size= m)
clay= np.random.randint(low= 50, high= 100, size= m)
gases= np.random.randint(low= 1, high= 15, size= m)
organic_matter= np.random.randint(low= 12, high= 51, size= m)

geo= ['ice', 'forest', 'city']
geo_values= {'ice':24, 'forest':54, 'city':12}
geoarea= ['ram']*m
for i in range(m):
    geoarea[i]= random.choice(geo)

value= 5 * water + 2.5 * sand + 6.0 * silt + 2.2 * clay + 7.2 * gases + 3.2 * organic_matter 
for i in range(m):
    value[i]+= geo_values.get(geoarea[i])
    
data= pd.DataFrame({'water':water, 'sand':sand, 'silt':silt, 'clay':clay, 'gases':gases, 'geoarea':geoarea, 'organic_matter':organic_matter, 'value':value})
data.head(2)

Unnamed: 0,water,sand,silt,clay,gases,geoarea,organic_matter,value
0,6,594,20,65,12,forest,46,2065.6
1,3,526,19,55,8,ice,20,1710.6


In [5]:
featue_cols= ['water', 'sand', 'silt', 'clay', 'gases', 'geoarea', 'organic_matter']
target_col= 'value'
X= data[featue_cols]
y= data[target_col]
X_train, X_test, y_train, y_test= train_test_split(X, y)

In [6]:
linreg= LinearRegression()
linreg.fit(X_train, y_train, cat='geoarea', alpha= 0.1)

500. Cost: [0.05339352], theta: [[-3.01171397 -0.17094987  0.62770333 -0.52629756 -0.78046458 -0.16878906
  -0.34030722  5.67571088  5.64412462  5.66845053]]
1000. Cost: [0.01038743], theta: [[-4.11579116 -0.08151544  0.77741343 -0.21539384 -0.30179591 -0.07215151
  -0.14466426  5.29529971  5.29202433  5.2968848 ]]
1500. Cost: [0.00203739], theta: [[-4.60447811 -0.02833584  0.86764327 -0.08073709 -0.11446985 -0.0156707
  -0.04305147  5.12892358  5.13484428  5.13175404]]
2000. Cost: [0.00039962], theta: [[-4.82090051e+00 -4.68773141e-03  9.07986233e-01 -2.10808621e-02
  -3.18559138e-02  9.41404638e-03  2.17619914e-03  5.05526266e+00
   5.06522354e+00  5.05861330e+00]]
2500. Cost: [7.83818223e-05], theta: [[-4.91674933e+00  5.78641452e-03  9.25859414e-01  5.34042266e-03
   4.72687706e-03  2.05233490e-02  2.22100250e-02  5.02264008e+00
   5.03438990e+00  5.02622069e+00]]
3000. Cost: [1.5373955e-05], theta: [[-4.95919876  0.01042521  0.93377517  0.01704187  0.02092855  0.02544341
   0.0310

### Training data evaluaiton

In [7]:
y_pred= linreg.predict(linreg.makedummy(X_train, col='geoarea'))
mse= linreg.mse(y_train, y_pred)
print("Training mse:",mse)

Training mse: [6.1256825e-07]


### Testing data evaluation

In [8]:
y_pred= linreg.predict(linreg.makedummy(X_test, col='geoarea'))
mse= linreg.mse(y_test, y_pred)
print("Testing mse:",mse)

Testing mse: [0.00065405]


## DONE