In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
import tensorflow as tf
from keras.layers import Dense
from keras.models import Sequential

Using TensorFlow backend.


In [2]:
#### This is for the growth of the memory usage as is needed by the process ####### 
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the fourth GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [3]:
####  loading data ######
train=pd.read_csv('D:/My stuff/ML TASK INTERNSHALA/Knight ML Assignment/Data/train.csv')
test=pd.read_csv('D:/My stuff/ML TASK INTERNSHALA/Knight ML Assignment/Data/test.csv')
print(train.shape,test.shape)

(82657, 12) (20665, 11)


In [4]:
##### dropping user name as it is not relevant for the features ###########
train=train.drop(['user_name'],axis=1)

In [5]:
##### Search years and make another column for years ######
import re
year = []  
for value in train['review_title']:
    res = re.search(r'19\d{2}|20\d{2}', value)
    if res:
        year.append(res.group())
    else: year.append(None)

train['year'] = year

In [6]:
##### Removing rows which dont have years #######
df_final=train
df_final=df_final.dropna(subset=['year'])
print('Removed ' + str(train.shape[0]-df_final.shape[0]) + ' rows with empty year values.' + "\n")

df_final['year']=str(df_final['year'])


print(df_final['year'].describe())


Removed 2528 rows with empty year values.

count                                                 80129
unique                                                    1
top       0        2007\n1        2014\n2        2007\n3...
freq                                                  80129
Name: year, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
df_final.year.isnull().describe()

count     80129
unique        1
top       False
freq      80129
Name: year, dtype: object

In [8]:
######## Adding all the field together so as to gather all the necessary information ######
df_final.fillna(' ')
df_final['new']=df_final['country'].astype(str)+' '+df_final['review_title'].astype(str)+' '+df_final['review_description'].astype(str)+' '+df_final['designation'].astype(str)+' '+df_final['province'].astype(str)+' '+df_final['region_1'].astype(str)+' '+df_final['winery'].astype(str)+' '+df_final['year'].astype(str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
df_final.new.isnull()

0        False
1        False
2        False
3        False
4        False
         ...  
82652    False
82653    False
82654    False
82655    False
82656    False
Name: new, Length: 80129, dtype: bool

In [10]:
#### stopword list ######
sw = stopwords.words('english')
X=df_final['new']
print(X[0])

Australia Andrew Peace 2007 Peace Family Vineyard Chardonnay (South Eastern Australia) Classic Chardonnay aromas of apple, pear and hay lead into a palate marked by decent intensity but also a bit of sweetness. Orange and candy notes run through the rather short finish. Peace Family Vineyard Australia Other South Eastern Australia Andrew Peace 0        2007
1        2014
2        2007
3        2010
4        2012
         ... 
82652    2007
82653    2008
82654    2014
82655    2011
82656    2010
Name: year, Length: 80129, dtype: object


In [11]:
#### making a list of word that will help in the classification ######
list_of_words = []
for phase_word in X:
    list_of_words.append(' '.join([re.sub('[^a-zA-Z0-9]', '', word) for word in phase_word.split() if not word in sw]))
X = list_of_words

In [12]:
#### using the TFIDF as the countvectorization ######
tfidf = TfidfVectorizer(min_df=5)
X = tfidf.fit_transform(X)

In [13]:
### Extracting the variety and making the y variable that we will predict ####
y=df_final['variety']
df_final=df_final.drop(['variety'],axis=1)
### label encoding the data #####
labelEncoder = LabelEncoder()
y = labelEncoder.fit_transform(y)
### Splitting data into train and test ######
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [14]:
#### Model creation ######
model = Sequential()
model.add(Dense(100, activation='relu', input_dim=len(tfidf.get_feature_names())))
model.add(Dense(units=y.max()+1, activation='sigmoid'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=2, verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x264f26d6b88>

In [15]:
### predictions and accuracy ####
scores = model.evaluate(X_test, y_test, verbose=1)
print ('The accuracy of the model is %s' % scores[1])

The accuracy of the model is 0.9561337828636169


In [29]:
## formatting the test data correctly to input into the model ######
def correctFormat(df):
    df=df.drop(['user_name'],axis=1)
    year = []  
    for value in df['review_title']:
        res = re.search(r'19\d{2}|20\d{2}', value)
        if res:
            year.append(res.group())
        else: year.append(None)
    df['year'] = year
    df_final=df
    df_final=df_final.dropna(subset=['year'])
    df_final['year']=str(df_final['year'])
    df_final.fillna(' ')
    df_final['new']=df_final['country'].astype(str)+' '+df_final['review_title'].astype(str)+' '+df_final['review_description'].astype(str)+' '+df_final['designation'].astype(str)+' '+df_final['province'].astype(str)+' '+df_final['region_1'].astype(str)+' '+df_final['winery'].astype(str)+' '+df_final['year'].astype(str)
    sw = stopwords.words('english')
    X=df_final['new']
    list_of_words = []
    for phase_word in X:
        list_of_words.append(' '.join([re.sub('[^a-zA-Z0-9]', '', word) for word in phase_word.split() if not word in sw]))
    X = list_of_words
    #tfidf = TfidfVectorizer(min_df=5)
    X = tfidf.transform(X)
    return X

In [30]:
test_format=correctFormat(test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [31]:
print(test_format.shape,X_train.shape)

(20059, 18396) (64103, 18396)


In [32]:
### predicting on test cases #####
predictions_test=model.predict_classes(test_format)


In [36]:
pd_t=predictions_test
pd_t_label=labelEncoder.inverse_transform(pd_t)

In [37]:
pd_t_label[0]

'Pinot Noir'

In [41]:
### Saving the predictions into txt file ##### 
f=open('predictions_for_testcsv.txt','w')
for i in range(len(pd_t_label)):
    f.write(str(pd_t_label[i])+'\n')
    
    
f.close()

In [42]:
## saving the model #####
model.save('model_withoutKfold.h5')