### Find the best dimensionality-reduction algorithm for the given dataset among PCA, SVD and LDA

In [2]:
# Some Presets
%matplotlib inline

#imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# Larger default figure size and a sans-serif font for every plot in this notebook.
plt.rcParams.update({
    'figure.figsize': (15, 5),
    'font.family': 'sans-serif',
})

# Widen the pandas console display so frames with many columns are not wrapped or truncated.
# (Originally added for pandas 0.12; noted as unnecessary from pandas 0.13 on.)
pd.set_option('display.width', 5000, 'display.max_columns', 60)

In [3]:
#SkLearn Library Imports 
from sklearn.decomposition import PCA,TruncatedSVD,IncrementalPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [4]:
# Initializers
# Both text vectorizers share the same settings: English stop words removed,
# unigrams only, vocabulary capped at the 6000 most frequent terms.
vectorizer_options = dict(stop_words='english', max_features=6000, ngram_range=(1, 1))

le = LabelEncoder()                          # integer-encodes categorical columns
cv = CountVectorizer(**vectorizer_options)   # raw term counts
tf = TfidfVectorizer(**vectorizer_options)   # TF-IDF weighted terms

In [5]:
# Set the working directory so the relative data path used when loading the CSV resolves.
# A raw string is used because the backslashes are Windows path separators, not escape
# sequences: in a normal string '\P', '\J', '\D' and '\M' are invalid escapes that only
# work by accident and trigger a DeprecationWarning/SyntaxWarning on newer Pythons.
# NOTE(review): this is a machine-specific absolute path - adjust it for your environment.
os.chdir(r'D:\Python\Jazari_AI_Lessons\DataSets\ML_Assignment_6')

In [6]:
# Load the wine reviews CSV from the current working directory into a DataFrame.
data = pd.read_csv('./wine_reviews.csv')

In [7]:
# Sanity check: report the (rows, columns) of the freshly loaded frame.
loaded_shape = data.shape
print("Shape of Data", loaded_shape)

Shape of Data (129971, 14)


In [8]:
# Drop the 'Unnamed: 0' column from the loaded frame.
# errors='ignore' makes this cell idempotent: re-running it after the column is already
# gone no longer raises a KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [9]:
# Number of missing values in each column (displayed as the cell output).
data.isnull().sum()

country                     63
description                  0
designation              37465
points                       0
price                     8996
province                    63
region_1                 21247
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

In [10]:
# Data cleanup - fill NaNs in the straightforward categorical columns.
# Each column gets the sentinel 'No_<column name>' so that missing values form their own
# category. Deriving the sentinel from the column name keeps the placeholders consistent
# (the hand-written version had a misspelled 'No_tasteer_twitter_handle').
# Note: 'price' also contains NaNs but is not handled in this cell.
for column in ('country', 'designation', 'province', 'region_1', 'region_2',
               'taster_name', 'taster_twitter_handle', 'variety'):
    data[column] = data[column].fillna('No_' + column)

In [11]:
# Basic categorical preprocessing: integer-encode each straightforward categorical
# column into a sibling '<column>_code' column.
# Note: NaN values must be filled before the label encoder is applied.
# The single shared encoder `le` is re-fitted for every column, so after this cell it
# only remembers the classes of the last column processed ('variety').
for source_column in ('country', 'designation', 'province', 'region_1', 'region_2',
                      'taster_name', 'taster_twitter_handle', 'variety'):
    data[source_column + '_code'] = le.fit_transform(data[source_column])

In [12]:
# Spot check: mean price of the wines whose country is Argentina.
argentina_rows = data['country'] == 'Argentina'
data[argentina_rows].groupby('country')['price'].mean()

country
Argentina    24.510117
Name: price, dtype: float64

In [13]:
# Show the first rows of the country column next to its integer code.
data.loc[:, ['country', 'country_code']].head(5)

Unnamed: 0,country,country_code
0,Italy,22
1,Portugal,32
2,US,41
3,US,41
4,US,41


In [14]:
# Build the training frame by removing the raw text/categorical columns listed below;
# their '<column>_code' counterparts and the remaining columns are kept.
raw_columns = [
    'country', 'designation', 'province', 'region_1', 'region_2',
    'taster_name', 'taster_twitter_handle', 'title', 'variety', 'winery',
]
data_train = data.drop(columns=raw_columns)

In [15]:
# Fit the count vectorizer on the review descriptions and transform them in one step.
description_cv_vector = cv.fit_transform(raw_documents=data_train['description'])

In [16]:
# Fit the TF-IDF vectorizer on the review descriptions and transform them in one step.
description_tf_vector = tf.fit_transform(raw_documents=data_train['description'])

In [17]:
# Report the (documents, vocabulary terms) shape of each vectorized description matrix.
for label, matrix in (
    ("Shape of Count Vectorized Descriptions ", description_cv_vector),
    ("Shape of TfIDF Vectorized Descriptions ", description_tf_vector),
):
    print(label, matrix.shape)

Shape of Count Vectorized Descriptions  (129971, 6000)
Shape of TfIDF Vectorized Descriptions  (129971, 6000)


In [23]:
# Show rows whose 'points' value is missing (the boolean mask is used directly).
missing_points = data['points'].isna()
data[missing_points].head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,country_code,designation_code,province_code,region_1_code,region_2_code,taster_name_code,taster_twitter_handle_code,variety_code


In [None]:
# Compare the total explained-variance ratio of PCA, LDA and truncated SVD on the
# TF-IDF description features, using the review 'points' as the class label for LDA.
#
# LDA requires a dense array (and PCA does too on older scikit-learn versions).
# Densifying every row costs n_rows * n_features * 8 bytes as float64, so all three
# models are fitted on the same reproducible random subsample instead.
# NOTE(review): SAMPLE_SIZE is a memory/runtime trade-off - tune it for your machine.
SAMPLE_SIZE = 10000
rng = np.random.RandomState(42)
n_rows = description_tf_vector.shape[0]
sample_idx = np.sort(rng.choice(n_rows, size=min(SAMPLE_SIZE, n_rows), replace=False))

X = description_tf_vector[sample_idx].toarray()   # dense feature matrix for the sampled rows
y = data['points'].values[sample_idx]             # matching targets (rows align with `data`)

#Declaring PCA
# Keeping it to 90% of the max_features (5400 of 6000), capped by what the sample allows
# because PCA needs n_components <= min(n_samples, n_features).
n_pca_components = min(5400, X.shape[0], X.shape[1])
inc_pca = PCA(n_components=n_pca_components, random_state=42)
inc_pca.fit(X)
print("Explained Variance Ratio for PCA :", np.sum(inc_pca.explained_variance_ratio_))

#Declaring LDA
# LDA is supervised: fit() requires the class labels y.
LDA_clf = LinearDiscriminantAnalysis()
LDA_clf.fit(X, y)
print("Explained Variance Ratio of LDA :", np.sum(LDA_clf.explained_variance_ratio_))

#Declaring SVD
# TruncatedSVD is unsupervised (any y passed to fit is ignored) and uses a randomized
# solver by default, so it is seeded for reproducibility.
SVD_clf = TruncatedSVD(random_state=42)
SVD_clf.fit(X)
print("Explained Variance Ratio of SVD :", np.sum(SVD_clf.explained_variance_ratio_))

# NOTE(review): the three totals are not directly comparable as they stand - each model
# keeps a different number of components (PCA up to 5400, TruncatedSVD its default, LDA at
# most n_classes - 1), and LDA's ratio refers to between-class variance. Confirm the
# intended comparison before drawing conclusions.
