<h2>Dimension Reduction</h2>
A high-dimensional dataset is a dataset that has a great number of columns 
(or variables). Such a dataset presents many mathematical or computational challenges.

The good news is that if the variables are correlated, we can transform them into a smaller set of new variables without losing much information. This is called dimension reduction.

Principal Component Analysis (PCA) is probably the most popular technique for dimension reduction.

In [2]:
import pandas as pd
import numpy as np 
from sklearn import linear_model
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA,TruncatedSVD

In [2]:
from sklearn.datasets import load_boston

# Load the Boston housing bunch and wrap its feature matrix and target
# vector in DataFrames (the target frame gets a single column named 0).
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
# in 1.2, so this cell needs an older scikit-learn — confirm the pinned version.
boston = load_boston()
df_x = pd.DataFrame(data=boston.data, columns=boston.feature_names)
df_y = pd.DataFrame(data=boston.target)

In [3]:
# Baseline: hold out 20% of the rows (fixed seed), then fit ordinary least
# squares on the untransformed feature columns.
x_train, x_test, y_train, y_test = train_test_split(
    df_x, df_y, test_size=0.2, random_state=4
)
reg = linear_model.LinearRegression().fit(x_train, y_train)
reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [4]:
# Coefficient of determination (R^2) of `reg` on the held-out test split.
reg.score(x_test,y_test)

0.7263451459702523

In [5]:
# Preview the first rows of the feature frame (head() defaults to five rows).
df_x.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [6]:
# Project the features onto the first 10 principal components, whitened to
# unit variance per component.
# `whiten` must be the boolean True. The original passed the string 'True',
# which only worked because a non-empty string is truthy; scikit-learn
# releases that validate parameter types reject it.
# NOTE(review): PCA is fitted on the full, unscaled dataset before the
# train/test split, so test rows influence the components — consider fitting
# on the training split only and standardizing first.
pca = PCA(n_components=10, whiten=True)
x = pca.fit_transform(df_x)

In [7]:
# Variance captured by each retained principal component, largest first.
pca.explained_variance_

array([3.08899113e+04, 6.25033006e+03, 8.18363958e+02, 2.66684838e+02,
       4.98315275e+01, 2.78767934e+01, 1.60633859e+01, 9.52731770e+00,
       3.27333185e+00, 1.18094181e+00])

In [8]:
# Same 80/20 split (same seed) and linear model, now trained on the reduced
# matrix `x` instead of the original feature frame.
x_train, x_test, y_train, y_test = train_test_split(
    x, df_y, test_size=0.2, random_state=4
)
reg = linear_model.LinearRegression().fit(x_train, y_train)
reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [9]:
# R^2 of the model refitted on the reduced features, on the held-out split.
reg.score(x_test,y_test)

0.6901733717990524

In [10]:
# Reduce the features to 10 components with truncated SVD (which, unlike PCA,
# does not mean-center the data), then refit the linear model on an 80/20 split.
# random_state is pinned because TruncatedSVD's default solver is randomized;
# without a seed, reruns are not guaranteed to reproduce the same components
# or score.
svd = TruncatedSVD(n_components=10, random_state=4)
x = svd.fit_transform(df_x)
x_train, x_test, y_train, y_test = train_test_split(
    x, df_y, test_size=0.2, random_state=4
)
reg = linear_model.LinearRegression()
reg.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
# R^2 of the model refitted on the SVD-reduced features, on the held-out split.
reg.score(x_test,y_test)

0.6822252753655192

In [12]:
# Pairwise correlation matrix of the feature columns (pandas defaults to Pearson).
df_x.corr()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
CRIM,1.0,-0.200469,0.406583,-0.055892,0.420972,-0.219247,0.352734,-0.37967,0.625505,0.582764,0.289946,-0.385064,0.455621
ZN,-0.200469,1.0,-0.533828,-0.042697,-0.516604,0.311991,-0.569537,0.664408,-0.311948,-0.314563,-0.391679,0.17552,-0.412995
INDUS,0.406583,-0.533828,1.0,0.062938,0.763651,-0.391676,0.644779,-0.708027,0.595129,0.72076,0.383248,-0.356977,0.6038
CHAS,-0.055892,-0.042697,0.062938,1.0,0.091203,0.091251,0.086518,-0.099176,-0.007368,-0.035587,-0.121515,0.048788,-0.053929
NOX,0.420972,-0.516604,0.763651,0.091203,1.0,-0.302188,0.73147,-0.76923,0.611441,0.668023,0.188933,-0.380051,0.590879
RM,-0.219247,0.311991,-0.391676,0.091251,-0.302188,1.0,-0.240265,0.205246,-0.209847,-0.292048,-0.355501,0.128069,-0.613808
AGE,0.352734,-0.569537,0.644779,0.086518,0.73147,-0.240265,1.0,-0.747881,0.456022,0.506456,0.261515,-0.273534,0.602339
DIS,-0.37967,0.664408,-0.708027,-0.099176,-0.76923,0.205246,-0.747881,1.0,-0.494588,-0.534432,-0.232471,0.291512,-0.496996
RAD,0.625505,-0.311948,0.595129,-0.007368,0.611441,-0.209847,0.456022,-0.494588,1.0,0.910228,0.464741,-0.444413,0.488676
TAX,0.582764,-0.314563,0.72076,-0.035587,0.668023,-0.292048,0.506456,-0.534432,0.910228,1.0,0.460853,-0.441808,0.543993


['1. Title of Database: Wine recognition data\n',
 '\tUpdated Sept 21, 1998 by C.Blake : Added attribute information\n',
 '\n',
 '2. Sources:\n',
 '   (a) Forina, M. et al, PARVUS - An Extendible Package for Data\n',
 '       Exploration, Classification and Correlation. Institute of Pharmaceutical\n',
 '       and Food Analysis and Technologies, Via Brigata Salerno, \n',
 '       16147 Genoa, Italy.\n',
 '\n',
 '   (b) Stefan Aeberhard, email: stefan@coral.cs.jcu.edu.au\n',
 '   (c) July 1991\n',
 '3. Past Usage:\n',
 '\n',
 '   (1)\n',
 '   S. Aeberhard, D. Coomans and O. de Vel,\n',
 '   Comparison of Classifiers in High Dimensional Settings,\n',
 '   Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of\n',
 '   Mathematics and Statistics, James Cook University of North Queensland.\n',
 '   (Also submitted to Technometrics).\n',
 '\n',
 '   The data was used with many others for comparing various \n',
 '   classifiers. The classes are separable, though only RDA \n',
 

In [6]:


# Load the CSV into a DataFrame.
# The original literal "sample_data\" did not parse: the backslash escaped the
# closing quote (unterminated string), and no file name followed the directory.
# TODO(review): "wine.csv" is a placeholder inferred from the wine-dataset
# description printed in this notebook — confirm the actual file name.
dataset = pd.read_csv("sample_data/wine.csv")


In [None]:
# Positional split: columns 0-12 go to X, column 13 to y (presumably the
# features and the target — confirm against the CSV layout).
X, y = dataset.iloc[:, :13], dataset.iloc[:, 13]