## In this notebook we will explore and process the dataset and implement ML models for classifying images based on text

In [10]:
# Import libraries
import pandas as pd
import numpy as np
import demoji
import nltk
import os

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import re
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import multioutput
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
#ML models
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC,SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import pickle
import re,string

import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot

In [11]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/sanket/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sanket/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sanket/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## ETL
### In this part we will explore the data, processing and feature extraction before diving into the ML side

In [None]:
def strip_all_entities(text):
    entity_prefixes = ['@','@_','#']
    for separator in  string.punctuation:
        if separator not in entity_prefixes :
            text = text.replace(separator,' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

In [74]:
# This class is used to process the data.
class DataProcessing:
    def __init__(self,df):
        '''
        Input
        df: datafame
        Initialize the class instance object variable df with data frame
        '''
        self.df = df
    
    def extract_img_url(self,df):
        df['urls'] = df['urls'].apply(lambda x: x[0].split('/')[-1]if len(x)>0 else np.nan)
        self.df = df
        
    def extract_thumbnail_url(self,df):
        df['thumbnail_src'] = df['thumbnail_src'].apply(lambda x: x.split('/')[-1])
        self.df = df
        
        
    def drop_column(self,df,cols=None):
        df.drop(cols, axis=1,inplace=True)
        self.df = df
    
    def extract_text(self,df):
        df['edge_media_to_caption'] = df['edge_media_to_caption'].apply(lambda x: demoji.
                                                                    replace(x['edges'][0]['node']['text']) 
                                                                    if len(x['edges']) > 0 else np.nan )
        df['edge_media_to_caption'] = df['edge_media_to_caption'].apply(lambda x:x.replace("\n"," ") if x!=np.nan else x)
        self.df = df
    
    def process_tags(self,df):
        df['tags'] = df['tags'].apply(lambda x: list(x))
        df['tags'] = df['tags'].apply(lambda x: ' '.join(x))
        self.df = df
    
    def keep_rows(self,df,images):
        df = df[df['thumbnail_src'].isin(images)]
        self.df = df
        
    def add_label(self):
        self.df['pavbhaji']=np.nan
        self.df['pavbhaji'] = self.df['thumbnail_src'].apply(lambda x: 1 if x in pavbhaji_images else 0)
        
    def extract_caption(self,df):
        df['caption'] = df['edge_media_to_caption'].apply(lambda x: strip_all_entities(x))
        self.df = df
    
    def add_tag_and_caption(self,df):
        df['text'] = df['tags']+ " " + df['caption']
        self.df = df

In [91]:
df = pd.read_json('dataset/pavbhaji.json')

In [92]:
pavbhaji_images = os.listdir('dataset/images/0')
nopavbhaji_images = os.listdir('dataset/images/1')
all_images = pavbhaji_images + nopavbhaji_images

In [93]:
df.head()

Unnamed: 0,dimensions,display_url,edge_liked_by,edge_media_preview_like,edge_media_to_caption,edge_media_to_comment,id,is_video,location,owner,shortcode,tags,taken_at_timestamp,thumbnail_resources,thumbnail_src,urls,video_view_count,comments_disabled
0,"{'height': 734, 'width': 640}",https://instagram.fpnq3-1.fna.fbcdn.net/vp/352...,{'count': 3797},{'count': 3797},{'edges': [{'node': {'text': 'TAG A PAV BHAJI ...,{'count': 52},1834712933156555776,True,,{'id': '1919686029'},Bl2NbElnIva,"[vadapav, foodgram, foodphotography, foodblogg...",1532934873,"[{'config_height': 150, 'config_width': 150, '...",https://instagram.fpnq3-1.fna.fbcdn.net/vp/cb5...,[https://instagram.fpnq3-1.fna.fbcdn.net/vp/89...,0.0,
1,"{'height': 750, 'width': 750}",https://instagram.fpnq3-1.fna.fbcdn.net/vp/b0d...,{'count': 12041},{'count': 12041},{'edges': [{'node': {'text': 'देसी स्टाइल पाव ...,{'count': 325},1826000656302706176,True,"{'has_public_page': True, 'id': '245717485', '...",{'id': '1445587278'},BlXQewejY3Z,"[healthyfood, sokolkata, mumbaifoodie, faridab...",1531897016,"[{'config_height': 150, 'config_width': 150, '...",https://instagram.fpnq3-1.fna.fbcdn.net/vp/ad3...,[https://instagram.fpnq3-1.fna.fbcdn.net/vp/9c...,0.0,
2,"{'height': 800, 'width': 640}",https://instagram.fpnq3-1.fna.fbcdn.net/vp/cb4...,{'count': 3544},{'count': 3544},{'edges': [{'node': {'text': 'Pav Bhaji😍😍😋\n.\...,{'count': 18},1849962218902145792,True,"{'has_public_page': True, 'id': '213724095', '...",{'id': '4759392841'},BmsYtYpjkq4,"[foodphotography, mumbaipeople, vascom, delhib...",1534752790,"[{'config_height': 150, 'config_width': 150, '...",https://instagram.fpnq3-1.fna.fbcdn.net/vp/64f...,[https://instagram.fpnq3-1.fna.fbcdn.net/vp/5e...,0.0,
3,"{'height': 937, 'width': 750}",https://instagram.fpnq3-1.fna.fbcdn.net/vp/180...,{'count': 1388},{'count': 1388},{'edges': [{'node': {'text': 'Asli Makhan Pav ...,{'count': 91},1828963273137213696,True,"{'has_public_page': True, 'id': '234730336', '...",{'id': '4628040416'},BlhyGgalFDw,"[foodvideo, misscravingbuster, foodblogger, mu...",1532249632,"[{'config_height': 150, 'config_width': 150, '...",https://instagram.fpnq3-1.fna.fbcdn.net/vp/802...,[https://instagram.fpnq3-1.fna.fbcdn.net/vp/33...,0.0,
4,"{'height': 936, 'width': 750}",https://instagram.fpnq3-1.fna.fbcdn.net/vp/5ee...,{'count': 223},{'count': 223},{'edges': [{'node': {'text': 'Tag a Pav bhaji ...,{'count': 14},1852265507774756864,True,,{'id': '7798081590'},Bm0kapkn3ha,[],1535027282,"[{'config_height': 150, 'config_width': 150, '...",https://instagram.fpnq3-1.fna.fbcdn.net/vp/292...,[https://instagram.fpnq3-1.fna.fbcdn.net/vp/6a...,0.0,


In [94]:
df.shape

(1500, 18)

In [95]:
process = DataProcessing(df)

In [96]:
process.extract_img_url(df)

In [97]:
process.extract_thumbnail_url(df)

In [98]:
process.keep_rows(df,all_images)

In [99]:
df = process.df

In [100]:
process.extract_text(df)

In [101]:
df = process.df

In [103]:
process.add_label()

In [110]:
process.process_tags(df)

In [111]:
df = process.df

In [105]:
df.head()

Unnamed: 0,dimensions,display_url,edge_liked_by,edge_media_preview_like,edge_media_to_caption,edge_media_to_comment,id,is_video,location,owner,shortcode,tags,taken_at_timestamp,thumbnail_resources,thumbnail_src,urls,video_view_count,comments_disabled,pavbhaji
17,"{'height': 1155, 'width': 1080}",https://instagram.fpnq3-1.fna.fbcdn.net/vp/700...,{'count': 45},{'count': 45},Chicken Tikka . . . Follow @dilli_ki_teekhi_m...,{'count': 5},1855224025926588672,False,,{'id': '7815521541'},Bm_FGwUH9l9,"[westbengal, iphone, delhifoodie, sokolkata, m...",1535379959,"[{'config_height': 150, 'config_width': 150, '...",39790065_708138802879611_4373499256883904512_n...,39790065_708138802879611_4373499256883904512_n...,,0.0,1
18,"{'height': 1080, 'width': 1080}",https://instagram.fpnq3-1.fna.fbcdn.net/vp/da9...,{'count': 215},{'count': 215},"Hello frandz, pav bhaji khaalo Garam hai Ye a...",{'count': 0},1855211863837300480,False,"{'has_public_page': True, 'id': '498870164', '...",{'id': '4846807954'},Bm_CVxfHNd-,"[foodgram, foodphotography, foodblogger, foodm...",1535378509,"[{'config_height': 150, 'config_width': 150, '...",39205669_548076665624561_2856530375738392576_n...,39205669_548076665624561_2856530375738392576_n...,,0.0,0
19,"{'height': 1350, 'width': 1080}",https://instagram.fpnq3-1.fna.fbcdn.net/vp/a20...,{'count': 46},{'count': 46},Follow @dilliciousfoodie @dilliciousfoodie @...,{'count': 0},1855207036881438720,False,"{'has_public_page': True, 'id': '1946652909695...",{'id': '5689462323'},Bm_BPiCH4hM,"[foodphotography, bhaji, pasta, fries, f52gram...",1535377934,"[{'config_height': 150, 'config_width': 150, '...",39928567_2025694197741778_2817802553157661723_...,39928567_2025694197741778_2817802553157661723_...,,0.0,1
20,"{'height': 750, 'width': 750}",https://instagram.fpnq3-1.fna.fbcdn.net/vp/b5f...,{'count': 741},{'count': 741},We’ve got you some delicious evening snacks . ...,{'count': 2},1855194071632941056,False,,{'id': '1996092261'},Bm--S3NF5Bi,"[UpcyclingFood, Twist, LeftoverPavBhajiBreadPa...",1535376388,"[{'config_height': 150, 'config_width': 150, '...",39991006_472180219962850_7601786252118982656_n...,39991006_472180219962850_7601786252118982656_n...,,0.0,1
21,"{'height': 565, 'width': 1080}",https://instagram.fpnq3-1.fna.fbcdn.net/vp/6a2...,{'count': 28},{'count': 28},People who love food are the best . . . . . ....,{'count': 0},1855192456304594176,False,"{'has_public_page': True, 'id': '239380854', '...",{'id': '3421689455'},Bm-97W0BQyn,"[foodphotography, lonidosa, tbt, foodie, mahar...",1535376196,"[{'config_height': 150, 'config_width': 150, '...",37158549_528309647598249_6068909533663592448_n...,37158549_528309647598249_6068909533663592448_n...,,0.0,1


## Using only tags to train the model for image classification

In [116]:
X = df['tags']
y = df.iloc[:,-1]
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

In [114]:
def tokenize(text):
    '''
    Input
    text: take the text as input
    Output
    words_lemmed: tokenized and lemmatized text with stop words removed 
    '''
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    stop_words = stopwords.words("english")
    
    #tokenize
    words = word_tokenize (text)
    words_lemmed = [WordNetLemmatizer().lemmatize(w) for w in words if w not in stop_words]
    return words_lemmed

In [117]:
vect = CountVectorizer(tokenizer=tokenize)
X = vect.fit_transform(X)
tfidf = TfidfTransformer()
X = tfidf.fit_transform(X)

In [146]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 22)

In [124]:
pipeline = Pipeline([('scale',StandardScaler(with_mean=False)),
                     ('clf', LinearSVC())])

In [128]:
search_space = [{'clf':[LinearSVC()]},
                
                {'clf': [LogisticRegression(solver='sag',random_state=22)]},
                
                {'clf': [MultinomialNB()]},
                {'clf':[RandomForestClassifier(n_estimators=200, max_depth=3, random_state=22)]}]

In [129]:
cv = GridSearchCV(pipeline, search_space)

In [130]:
cv.fit(X_train,y_train)


Liblinear failed to converge, increase the number of iterations.


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge



GridSearchCV(estimator=Pipeline(steps=[('scale',
                                        StandardScaler(with_mean=False)),
                                       ('clf', LinearSVC())]),
             param_grid=[{'clf': [LinearSVC()]},
                         {'clf': [LogisticRegression(random_state=22,
                                                     solver='sag')]},
                         {'clf': [MultinomialNB()]},
                         {'clf': [RandomForestClassifier(max_depth=3,
                                                         n_estimators=200,
                                                         random_state=22)]}])

In [131]:
pred = cv.predict(X_test)

In [133]:
np.mean(pred == np.array(y_test))

0.6460176991150443

## Using deep learning model for classification

In [161]:
x_train_array = X_train.toarray()
y_train_array = np.array(y_train)
x_test_array = X_test.toarray()
y_test_array = np.array(y_test)

In [159]:
from keras.models import Sequential
from keras import layers
input_dim = X_train.shape[1]  # Number of features
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

In [162]:
scaler = StandardScaler()
# transform data
x_train_array = scaler.fit_transform(x_train_array)
x_test_array = scaler.fit_transform(x_test_array)

In [163]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 10)                25640     
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 11        
Total params: 25,651
Trainable params: 25,651
Non-trainable params: 0
_________________________________________________________________


In [164]:
history = model.fit(x_train_array, y_train_array,epochs=100,verbose=False,validation_data=(x_test_array, y_test_array),batch_size=10)

In [165]:
loss, accuracy = model.evaluate(x_train_array, y_train_array, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(x_test_array, y_test_array, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9853
Testing Accuracy:  0.6637
