## WIKIMEDIA - Image/Caption Matching 
### We shall do a bit of EDA using various tools
#### We shall use AutoViz and SweetViz, and then do a bit of analysis using standard techniques

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Load the libraries

In [None]:
import io
import os
import urllib
import urllib.request

import cv2
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL.Image
import plotly.express as px
import plotly.graph_objs as go
import requests
from IPython.display import Image, display

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### List the files

In [None]:
# Show the entries directly inside the competition dataset directory.
os.listdir('../input/wikipedia-image-caption/')

### Load the Main File

In [None]:
# Read the tab-separated test set (first row is taken as the header, the
# pandas default) and display it as the cell output.
test_file = pd.read_csv('../input/wikipedia-image-caption/test.tsv', sep='\t')
test_file

In [None]:
# Read one image-pixel shard. The file is tab-separated and is read without a
# header row: the three column names are supplied explicitly.
wiki_df = pd.read_csv(
    '../input/wikipedia-image-caption/image_data_test/image_pixels/test_image_pixels_part-00000.csv',
    sep='\t',
    names=['image_url', 'b64_bytes', 'metadata_url'],
)
# Preview a few rows through the rich DataFrame display rather than print()-ing
# the frame (the b64_bytes column makes the plain-text dump unreadable).
wiki_df.head()


### Load the submission file

In [None]:
# Read the sample submission CSV (default comma separator, first row as header)
# and display it as the cell output.
sub_file = pd.read_csv('../input/wikipedia-image-caption/sample_submission.csv')
sub_file

### Check the size and shape

In [None]:
# Print (rows, columns) for the pixel shard, then for the sample submission.
for frame in (wiki_df, sub_file):
    print(frame.shape)

### Install the libraries

In [None]:
!pip install autoviz xlrd

### Load the class

In [None]:
import pandas as pd
from autoviz.AutoViz_Class import AutoViz_Class
# Create the AutoViz instance; its AutoViz() method is called in the next cell.
AV = AutoViz_Class()

In [None]:
# Run AutoViz on one training shard (tab-separated, header on the first row),
# with no dependent variable, SVG charts, and caps of 15,000 rows / 30 columns.
filename = "../input/wikipedia-image-caption/train-00001-of-00005.tsv"
sep = "\t"
autoviz_options = dict(
    sep=sep,
    depVar="",
    dfte=None,
    header=0,
    verbose=0,
    lowess=False,
    chart_format="svg",
    max_rows_analyzed=15000,
    max_cols_analyzed=30,
)
dft = AV.AutoViz(filename, **autoviz_options)

### Try the EDA

In [None]:
import matplotlib.pyplot as plt
import squarify    # pip install squarify (algorithm for treemap)

# Treemap of the test set: one tile per language, sized by its row count.
language_counts = test_file['language'].value_counts()
squarify.plot(
    sizes=language_counts.values,
    label=language_counts.index,
    color=["green", "violet", "yellow", "blue"],
    alpha=.8,
)
plt.axis('off')
plt.show()

In [None]:
# Code inspired by Georgii Sirotenko https://www.kaggle.com/georgiisirotenko/pytorch-fish-outliers-handling-test-100 & https://www.kaggle.com/mpwolke/wikimedia-urllib

import plotly.graph_objects as go

# Interactive bar chart of the number of test rows per language; the count is
# also written on each bar.
language_counts = test_file['language'].value_counts()
language_bar = go.Bar(
    x=language_counts.index,
    y=language_counts.values,
    text=language_counts.values,
    textposition='auto',
    name='hist',
    marker_color='skyblue',
)
fig = go.Figure(
    data=[language_bar],
    layout_title_text="WikiMedia Image Dataset Language Distribution",
)
fig.show()

### Load one of the image files

In [None]:
# Read another image-pixel shard (tab-separated, column names supplied
# explicitly) and display it as the cell output.
image_file = pd.read_csv('../input/wikipedia-image-caption/image_data_test/image_pixels/test_image_pixels_part-00004.csv', 
                         sep='\t', names=['image_url', 'b64_bytes', 'metadata_url'])
image_file

### Define a function for loading images - 12 at a time

In [None]:
def showimages(imagelist):
    """Download the images behind the given URLs and draw them on a 4x3 grid.

    Parameters
    ----------
    imagelist : iterable of str
        Image URLs. The grid has 12 cells, so at most the first 12 URLs are
        drawn; any further entries are ignored.

    Notes
    -----
    URLs containing ``.svg`` (case-insensitive) are not downloaded and their
    grid cell is left blank. Download or decoding errors are not caught and
    propagate to the caller.
    """
    n_rows, n_cols = 4, 3
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 12))
    cells = axes.ravel()

    # Hide every axis up front so skipped and unused cells stay blank instead
    # of showing empty tick marks.
    for cell in cells:
        cell.axis('off')

    for i, (cell, image_id) in enumerate(zip(cells, imagelist)):
        print(i, image_id)

        # Decide per image: a flag shared across iterations would make one SVG
        # suppress every image that follows it. Checking before the request
        # also avoids downloading a file that will not be drawn.
        if '.svg' in image_id.lower():
            print("Contains given SVG file ")
            continue

        with urllib.request.urlopen(image_id) as response:
            payload = response.read()

        # Decode from memory; no temporary file is written to the working dir.
        imagetoshow = PIL.Image.open(io.BytesIO(payload))
        print(imagetoshow)
        cell.imshow(imagetoshow)

    plt.show()

## Select 12 files at a time

In [None]:
# Take the image URLs in slice 90:102 (12 entries) and draw them on the grid.
manualdisplay=image_file.image_url[90:102].values
showimages(manualdisplay)

## Let us randomize

In [None]:
import random

# Pick a random window of rows, drop the SVG URLs, and show the first 12 of
# what is left. The window is wider than the grid so that there are spare
# images to take the place of any SVGs that are filtered out.
window_size = 30
grid_size = 12

# max(1, ...) keeps randrange valid when the frame has 30 rows or fewer.
start_num = random.randrange(0, max(1, len(image_file) - window_size))
end_num = start_num + window_size
candidates = image_file.image_url[start_num:end_num]

# Case-insensitive, literal match; missing URLs are treated as "not SVG".
is_svg = candidates.str.lower().str.contains('.svg', regex=False, na=False)
if is_svg.any():
    print("Contains given SVG file ")

# Filtering builds a new array, so image_file itself is never modified
# (assigning into `.values` can write through to the DataFrame).
imagelist = candidates[~is_svg].head(grid_size).values
print(imagelist.dtype)

showimages(imagelist)

## Perhaps a word cloud?

In [None]:
# Read only the first 3,000 rows of one tab-separated training shard and show
# the first five. Note: despite its name, `file_name` holds a DataFrame.
file_name = pd.read_csv('../input/wikipedia-image-caption/train-00001-of-00005.tsv', 
                        sep='\t',nrows=3000)
file_name.head(5)

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Load the mask through the notebook-wide `PIL.Image` module. A cell-level
# `from PIL import Image` would shadow the `Image` class imported from
# IPython.display in the imports cell.
kaggle_mask = np.array(PIL.Image.open('../input/kaggle/kaggle-logo.png'))

# Preview of the mask that shapes the word cloud.
fig = plt.figure(figsize=(10, 15))
plt.imshow(kaggle_mask, cmap=plt.cm.gray, interpolation='bilinear')
plt.axis('off')

# `stopwords` must be a collection of words. A bare string is iterated
# character by character, so 'site' would exclude the one-letter tokens
# 's', 'i', 't', 'e' and silently drop the library's default stopword list.
kaggle_wc = WordCloud(
    background_color='black',
    max_words=3000,
    stopwords=STOPWORDS | {'site'},
    mask=kaggle_mask,
)
# Build the cloud from all page titles of the training sample, joined by spaces.
kaggle_wc.generate(" ".join(file_name['page_title'].astype(str)))

fig = plt.figure(figsize=(20, 16))
plt.axis('off')
plt.imshow(kaggle_wc, interpolation='bilinear')
plt.show()

In [None]:
# Read the pixel shard the same way as the other pixel shards in this notebook:
# tab-separated, no header row, explicit column names. With the default comma
# separator the first data row would be consumed as the header.
pixel_file_sample_df = pd.read_csv(
    '../input/wikipedia-image-caption/image_data_test/image_pixels/test_image_pixels_part-00002.csv',
    sep='\t',
    names=['image_url', 'b64_bytes', 'metadata_url'],
)

# NOTE(review): the embedding shard is assumed to use the same headerless,
# tab-separated layout (image URL, then one comma-separated embedding string)
# - confirm against the dataset description.
embed_file_sample_df = pd.read_csv(
    '../input/wikipedia-image-caption/image_data_test/resnet_embeddings/test_resnet_embeddings_part-00001.csv',
    sep='\t',
    names=['image_url', 'embedding'],
)

display(embed_file_sample_df.head(2))
print(embed_file_sample_df.columns)

display(pixel_file_sample_df.head(2))
print(pixel_file_sample_df.columns)


In [None]:
# Show the column labels of the training sample loaded into `file_name`.
file_name.columns

In [None]:
# Print the distinct values found in each of the selected columns.
check_cols = ['language', 'mime_type', 'original_height',
              'original_width', 'is_main_image', 'page_changed_recently']
for column_name in check_cols:
    distinct_values = file_name[column_name].unique()
    print(distinct_values)

In [None]:
# Keep the first column plus the columns at positions 9-13 of the training
# sample. The cells that follow read `language`, `mime_type`,
# `original_height`, `original_width`, `is_main_image` and
# `attribution_passes_lang_id` from the result, so the positional selection is
# presumably meant to pick exactly those - verify against file_name.columns.
selected_positions = [0, *range(9, 14)]
temp_df3 = file_name.iloc[:, selected_positions]
temp_df1 = temp_df3.iloc[:, 0]    # first column, as a Series
temp_df2 = temp_df3.iloc[:, 1:]   # the five columns from positions 9-13
temp_df3.head()

In [None]:
import seaborn as sns

# Per-language minimum of every retained column.
temp_df3_group = temp_df3.groupby('language').agg('min')

# Minimum original height and width per language, drawn as two lines on the
# same axes.
fig, ax = plt.subplots(figsize=(20, 16))
for dimension in ('original_height', 'original_width'):
    temp_df3_group[dimension].plot(ax=ax, label=dimension)
plt.legend()
plt.show()

# One bar chart per column: how many languages share each per-language minimum.
counted_columns = ('mime_type', 'is_main_image', 'attribution_passes_lang_id')
for column_name in counted_columns:
    temp_df3_group[column_name].value_counts().plot.bar(label=column_name)
    plt.legend()
    plt.show()

# The same three count series side by side in a single grouped bar chart.
graph_df = pd.concat(
    [temp_df3_group[column_name].value_counts() for column_name in counted_columns],
    axis=1,
    sort=True,
)
graph_df.columns = ["Mime", "Main Image", "Attribution Passes"]
graph_df.plot.bar(figsize=(20, 16))
plt.legend()
plt.show()



# More exploration to come

### Thank you! 