# Data Mining project: Discover and describe areas of interest and events from geo-located data

## 1. Import Dataset and Libraries

In [1]:
# load pandas to deal with the data
import pandas as pd
# plotting
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# load data from table file where entries are separated with a space
data = pd.read_table("flickr_data2.csv", sep=",", low_memory=False)

data.columns = data.columns.str.strip()

print(data.columns)
print(data.info())
print(data.describe())
data.head()

Index(['id', 'user', 'lat', 'long', 'tags', 'title', 'date_taken_minute',
       'date_taken_hour', 'date_taken_day', 'date_taken_month',
       'date_taken_year', 'date_upload_minute', 'date_upload_hour',
       'date_upload_day', 'date_upload_month', 'date_upload_year',
       'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420240 entries, 0 to 420239
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  420240 non-null  int64  
 1   user                420240 non-null  object 
 2   lat                 420240 non-null  float64
 3   long                420240 non-null  float64
 4   tags                316730 non-null  object 
 5   title               381911 non-null  object 
 6   date_taken_minute   420239 non-null  float64
 7   date_taken_hour     420240 non-null  int64  
 8   date_taken_day      420240 non-null 

Unnamed: 0,id,user,lat,long,tags,title,date_taken_minute,date_taken_hour,date_taken_day,date_taken_month,date_taken_year,date_upload_minute,date_upload_hour,date_upload_day,date_upload_month,date_upload_year,Unnamed: 16,Unnamed: 17,Unnamed: 18
0,4395181099,30624617@N03,45.754858,4.82171,"chair,lyon,rhône,chaise,rhônealpes",Chaises avec vue,11.0,15,28,2,2010,23,20,28.0,2,2010.0,,,
1,4394748717,35853470@N00,45.75327,4.862953,,,51.0,17,28,2,2010,52,17,28.0,2,2010.0,,,
2,4394694699,11817998@N05,45.760655,4.846564,"365,iphone",59/365 - R46 V103 B163,29.0,17,28,2,2010,33,17,28.0,2,2010.0,,,
3,4394803790,11545749@N06,45.784,4.874072,"nin,nineinchnails,gift,screening,toiou,avott",2010-01-29 Toiou Avott Lyon,15.0,20,28,1,2010,38,12,28.0,2,2010.0,,,
4,4394803554,11545749@N06,45.784,4.874072,"lyon,nin,nineinchnails,gift,screening,toiou,avott",2010-01-28 Toiou Avott Lyon,10.0,20,28,1,2010,38,12,28.0,2,2010.0,,,


## Perform Exploratory Data Analysis
First, we will explore the most common **data quality issues**:
* missing-vals
* duplicates

Second, we will use [**descriptive statistics**](#desc-stats) to have get a statistical summary of the data. 

We will then use [**data visualisaiton**](#data-vis) to get a better understanding of the data.

### Wrong type values

In [3]:
# Affiche la taille initiale
print(f"Initial: {len(data)}")
print(data.iloc[98811])

# Dictionnaire des règles de validation
validation_rules = {
    'lat': lambda x: pd.api.types.is_number(x) and 45.69804 <= x <= 45.79972,
    'long': lambda x: pd.api.types.is_number(x) and 4.76075 <= x <= 4.93901,
    'date_taken_minute': lambda x: pd.api.types.is_number(x) and 0 <= x <= 59,
    'date_taken_hour': lambda x: pd.api.types.is_number(x) and 0 <= x <= 23,
    'date_taken_day': lambda x: pd.api.types.is_number(x) and 1 <= x <= 31,
    'date_taken_month': lambda x: pd.api.types.is_number(x) and 1 <= x <= 12,
    'date_taken_year': lambda x: pd.api.types.is_number(x) and 1900 <= x <= 2100,
    'date_upload_minute': lambda x: pd.api.types.is_number(x) and 0 <= x <= 59,
    'date_upload_hour': lambda x: pd.api.types.is_number(x) and 0 <= x <= 23,
    'date_upload_day': lambda x: pd.api.types.is_number(x) and 1 <= x <= 31,
    'date_upload_month': lambda x: pd.api.types.is_number(x) and 1 <= x <= 12,
    'date_upload_year': lambda x: pd.api.types.is_number(x) and 1900 <= x <= 2100,
}

# Fonction de nettoyage des colonnes
def clean_column(dataframe, column_name, validation_func):
    dataframe[column_name] = dataframe[column_name].apply(
        lambda x: x if validation_func(x) else np.nan
    )

# Appliquer les règles de validation à chaque colonne
for column, rule in validation_rules.items():
    if column in data.columns:
        clean_column(data, column, rule)

print(data.iloc[98811])
data.head()

Initial: 420240
id                                                           7387935070
user                                                       68256211@N06
lat                                                            45.76869
long                                                           4.843872
tags                  portrait,girl,canon,spring,lyon,parade,fille,p...
title                                                    Gay Pride 2012
date_taken_minute                                                  58.0
date_taken_hour                                                      15
date_taken_day                                                       16
date_taken_month                                                      6
date_taken_year                                                    2012
date_upload_minute                                                    2
date_upload_hour                                                     20
date_upload_day                                 

Unnamed: 0,id,user,lat,long,tags,title,date_taken_minute,date_taken_hour,date_taken_day,date_taken_month,date_taken_year,date_upload_minute,date_upload_hour,date_upload_day,date_upload_month,date_upload_year,Unnamed: 16,Unnamed: 17,Unnamed: 18
0,4395181099,30624617@N03,45.754858,4.82171,"chair,lyon,rhône,chaise,rhônealpes",Chaises avec vue,11.0,15.0,28.0,2.0,2010.0,,,28.0,2.0,2010.0,,,
1,4394748717,35853470@N00,45.75327,4.862953,,,51.0,17.0,28.0,2.0,2010.0,,,28.0,2.0,2010.0,,,
2,4394694699,11817998@N05,45.760655,4.846564,"365,iphone",59/365 - R46 V103 B163,29.0,17.0,28.0,2.0,2010.0,,,28.0,2.0,2010.0,,,
3,4394803790,11545749@N06,45.784,4.874072,"nin,nineinchnails,gift,screening,toiou,avott",2010-01-29 Toiou Avott Lyon,15.0,20.0,28.0,1.0,2010.0,,,28.0,2.0,2010.0,,,
4,4394803554,11545749@N06,45.784,4.874072,"lyon,nin,nineinchnails,gift,screening,toiou,avott",2010-01-28 Toiou Avott Lyon,10.0,20.0,28.0,1.0,2010.0,,,28.0,2.0,2010.0,,,


### Missing Values

To check the missing values, several approaches can be used:

1. The `info()` mwthods provides a summary of a dataframe in terms of the types of values, non-null values and memory usage. Thus, by comparing the number of non-null values of each column with the total number of entries, one can have an idea of missing values.
2. Using the [`isna()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isna.html) method. By summing the resulting values, we obtain the number of null values for each column.
3. To get the rows with any missing values, you can use `isna()` followed by `any(axis=1)`.

In [4]:
print(f"Initial: {len(data)}")
# remove rows with missing values on the columns id, lat, and long
data_cleaned_missing_values = data.dropna(subset=['id', 'lat', 'long'])
print(f"After removing missing values: {len(data_cleaned_missing_values)}")

Initial: 420240
After removing missing values: 355491


### Removing duplicates

In [6]:
# remove duplicates
print(f"Initial: {len(data_cleaned_missing_values)}")
print(data_cleaned_missing_values.duplicated().sum())
data_cleaned_duplicates = data_cleaned_missing_values.drop_duplicates(subset=['id', 'lat', 'long'],keep='first')
# show the stats
print(f"After removing duplicates: {len(data_cleaned_duplicates)}")

Initial: 355491
213891
After removing duplicates: 141596


### Descriptive Statistics

To obtain the statistical summary of the dataframe, we can use [`describe()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html). For different columns, it displays the count, the average value, the standard deviation, the min and max values, percentiles. 
By default, in mixed data types DataFrames, it displays the values for quantative data only:

In [7]:
data_cleaned_duplicates.describe()

Unnamed: 0,id,lat,long,date_taken_minute,date_taken_hour,date_taken_day,date_taken_month,date_taken_year,date_upload_minute,date_upload_hour,date_upload_day,date_upload_month,date_upload_year,Unnamed: 16,Unnamed: 17,Unnamed: 18
count,141596.0,141596.0,141596.0,141552.0,141566.0,141595.0,141571.0,141548.0,0.0,0.0,141594.0,141569.0,141549.0,46.0,0.0,1.0
mean,20181080000.0,45.761139,4.836779,29.386063,14.862587,15.091006,7.116387,2014.058065,,,15.347162,6.852425,2014.419501,1928.086957,,2012.0
std,13732510000.0,0.014849,0.020774,17.450493,4.990502,8.694537,3.406928,3.108261,,,8.448202,3.505216,2.766905,411.524041,,
min,306667500.0,45.698056,4.760769,0.0,0.0,1.0,1.0,1926.0,,,1.0,1.0,2009.0,12.0,,2012.0
25%,8046895000.0,45.757525,4.826516,14.0,12.0,8.0,4.0,2012.0,,,8.0,4.0,2012.0,2013.0,,2012.0
50%,15826600000.0,45.762366,4.832997,29.0,15.0,14.0,7.0,2014.0,,,15.0,7.0,2014.0,2015.0,,2012.0
75%,31481420000.0,45.768595,4.84401,44.0,18.0,23.0,10.0,2017.0,,,23.0,10.0,2017.0,2016.0,,2012.0
max,49148090000.0,45.79971,4.938987,59.0,23.0,31.0,12.0,2019.0,,,31.0,12.0,2019.0,2019.0,,2012.0


## Prepare data for clustering

First, we will droping the columns user, tag and title because they are not necessary for geographic clustering

In [8]:
df_clustering = data_cleaned_duplicates.drop(columns=['user'])
df_clustering = df_clustering.drop(columns=['tags'])
df_clustering = df_clustering.drop(columns=['title'])

df_clustering.head()

Unnamed: 0,id,lat,long,date_taken_minute,date_taken_hour,date_taken_day,date_taken_month,date_taken_year,date_upload_minute,date_upload_hour,date_upload_day,date_upload_month,date_upload_year,Unnamed: 16,Unnamed: 17,Unnamed: 18
0,4395181099,45.754858,4.82171,11.0,15.0,28.0,2.0,2010.0,,,28.0,2.0,2010.0,,,
1,4394748717,45.75327,4.862953,51.0,17.0,28.0,2.0,2010.0,,,28.0,2.0,2010.0,,,
2,4394694699,45.760655,4.846564,29.0,17.0,28.0,2.0,2010.0,,,28.0,2.0,2010.0,,,
3,4394803790,45.784,4.874072,15.0,20.0,28.0,1.0,2010.0,,,28.0,2.0,2010.0,,,
4,4394803554,45.784,4.874072,10.0,20.0,28.0,1.0,2010.0,,,28.0,2.0,2010.0,,,


Let's apply a [`StandardScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html). Recall, that for a given value `x`, a standard score is given by $z = \frac{x - mean(\mathbf{x})}{std(\mathbf{x})}$ 

In [9]:
# scaler
from sklearn.preprocessing import StandardScaler

In [10]:
# Scale the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_clustering)
# show
print(scaled_data)
# create a DataFrame
scaled_data_df = pd.DataFrame(data=scaled_data, columns=df_clustering.columns)
scaled_data_df.head()

print(len(scaled_data_df))

[[-1.14953145 -0.42300291 -0.72538134 ...         nan         nan
          nan]
 [-1.14956294 -0.52994641  1.25994429 ...         nan         nan
          nan]
 [-1.14956687 -0.0326053   0.47102249 ...         nan         nan
          nan]
 ...
 [ 2.09820947  1.39564253  1.85987915 ...         nan         nan
          nan]
 [ 1.79453936  0.50918336  0.17921433 ...         nan         nan
          nan]
 [ 1.80695534  0.72145406 -0.17637572 ...         nan         nan
          nan]]
141596


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


## Hierarchical Clustering

In [11]:
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

In [13]:
linkages = ['complete', 'average', 'single']
for linkage_type in linkages:
    # Appliquer AgglomerativeClustering
    cluster = AgglomerativeClustering(n_clusters=3, linkage=linkage_type)
    cluster_labels = cluster.fit_predict(scaled_data_df)

    # Calculer le score de silhouette
    silhouette_avg = silhouette_score(scaled_data_df, cluster_labels)
    print(f"Silhouette score for {linkage_type} linkage: {silhouette_avg:.4f}")

ValueError: Input X contains NaN.
AgglomerativeClustering does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values