In [131]:
#Loading Packages
import pandas as pd 
import numpy as np                     # For mathematical calculations 
import seaborn as sns                  # For data visualization 
import matplotlib.pyplot as plt        # For plotting graphs 
%matplotlib inline 
import warnings   # To ignore any warnings 
warnings.filterwarnings("ignore")
%matplotlib inline  
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt 
pd.set_option('display.max_colwidth', None)


# Loading Packages and Dataset 

In [132]:
# Read the complete age dataset (all rows, all columns) and preview the first 15 records.
df2 = pd.read_csv(filepath_or_buffer='../input/age-dataset/AgeDataset-V1.csv')
df2.head(n=15)

In [133]:
# Print the column dtypes and non-null counts of the freshly loaded frame.
df2.info()

In [134]:
# Preview the frame with every column that contains a missing value removed.
# Only the printed result is affected; df2 is not reassigned here.
print('DataFrame after dropping the columns having missing values:')
print(df2.dropna(axis='columns'))


# Database conclusion 

In [135]:
# Collapse multi-valued gender strings into one label each.
df2['Gender'] = df2['Gender'].replace({
    "Transgender Person; Intersex; Transgender Male": "Transgender Male",
    "Transgender Male; Male": "Transgender Male",
    "Female; Female": "Female",
    "Transgender Female; Female": "Transgender Female",
})


In [136]:
# Number of people per gender, displayed from most to least common.
people_by_gender = (
    df2.groupby('Gender')
       .size()
       .reset_index(name='Count')
)
people_by_gender.sort_values(by='Count', ascending=False)

In [137]:
# Sort ascending so the largest group is drawn last in the horizontal bar chart.
people_by_gender = people_by_gender.sort_values('Count')
plt.barh(y=people_by_gender['Gender'], width=people_by_gender['Count'])
plt.show()


# Deadliest Years - Top 10

In [138]:
# Count deaths per year and keep the ten years with the highest counts.
deaths_by_year = (
    df2.groupby('Death year')
       .size()
       .reset_index(name='Count')
       .sort_values(by='Count', ascending=False)
       .head(10)
)
deaths_by_year


# Death counts for the ten deadliest years

In [139]:
# Horizontal bars: one per year kept in deaths_by_year, bar length = death count.
plt.barh(y=deaths_by_year['Death year'], width=deaths_by_year['Count'])
plt.show()


In [140]:
# Summary statistics of the Occupation column.
df2['Occupation'].describe()


# Death counts by gender for specific occupations and countries


In [141]:
print("Politicians' Death Rate")
# Rows whose occupation is Politician (all genders, despite the variable name),
# then the number of such rows per gender.
Male_count = df2[df2['Occupation'].eq('Politician')]
Male_count['Gender'].value_counts()



In [142]:
print("Artist Death Rate")
# Rows whose occupation is Artist (all genders), counted per gender.
Male_count = df2[df2['Occupation'].eq('Artist')]
Male_count['Gender'].value_counts()



In [143]:
print("Astronomer Death Rate")
# Rows whose occupation is Astronomer (all genders), counted per gender.
Male_count = df2[df2['Occupation'].eq('Astronomer')]
Male_count['Gender'].value_counts()


In [144]:

print("Military personnel Death Rate")
# Rows whose occupation is Military personnel (all genders), counted per gender.
Male_count = df2[df2['Occupation'].eq('Military personnel')]
Male_count['Gender'].value_counts()


In [145]:
print("Politicians' Death Rate in United Kingdom ")

# Politicians recorded for the United Kingdom, counted per gender.
Male_count_country = df2[df2['Occupation'].eq('Politician') & df2['Country'].eq('United Kingdom')]
Male_count_country['Gender'].value_counts()



In [146]:
print("Politicians' Death Rate in United States of America")

# Politicians recorded for the United States of America, counted per gender.
Male_count_country = df2[df2['Occupation'].eq('Politician') & df2['Country'].eq('United States of America')]
Male_count_country['Gender'].value_counts()


In [147]:
# The printed label now names the country that is actually filtered on (France).
print("Politicians' Death Rate in France")

# Politicians recorded for France, counted per gender.
Male_count_country= df2[(df2['Occupation']=='Politician') & (df2['Country']=='France')]
Male_count_country['Gender'].value_counts()


In [148]:
# NOTE(review): this repeats the United Kingdom politician query that already
# appears earlier in the notebook — confirm whether another country was intended.
print("Politicians' Death Rate in United Kingdom ")

# Politicians recorded for the United Kingdom, counted per gender.
Male_count_country= df2[(df2['Occupation']=='Politician') & (df2['Country']=='United Kingdom')]
Male_count_country['Gender'].value_counts()


In [149]:
# Number of rows per occupation.
df2['Occupation'].value_counts()


In [150]:
# Number of rows per age of death.
df2['Age of death'].value_counts()


# Using NLP on ['Short description'] column 

# * Text preprocessing


In [151]:
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import re
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim

from sklearn.cluster import DBSCAN

import scipy
from scipy.cluster import hierarchy

from sklearn.cluster import AgglomerativeClustering


In [152]:
# Download NLTK's 'punkt' resource.
# NOTE(review): no later cell in the visible notebook appears to use punkt — confirm it is needed.
import nltk
nltk.download('punkt')


In [153]:
print("Non-null Value")
# Keep only fully populated rows. dropna(how='any') already removes rows whose
# 'Short description' is missing, so no separate notnull() filter is needed.
# The index is rebuilt so that row labels and row positions agree again;
# later cells combine groupby(...).groups (labels) with positional lookups.
df2 = df2.dropna(how='any').reset_index(drop=True)



# Deaths by Gender

In [154]:
print("Gender base death ")
# Count rows per gender and draw the counts as horizontal bars, smallest first.
(
    df2.groupby('Gender')['Age of death']
       .count()
       .sort_values()
       .tail(50)
       .plot.barh(figsize=(15, 8))
)

# Deaths by Occupation (50 most frequent occupations)


In [155]:
print("Occupation Base Death On Gender")
# Count rows per occupation and plot the 50 largest counts as horizontal bars.
(
    df2.groupby('Occupation')['Gender']
       .count()
       .sort_values()
       .tail(50)
       .plot.barh(figsize=(30, 18))
)


# Death Counts by Gender for Six High-Count Occupations

In [156]:
# Six occupations are selected below, so the label says six (it used to say five).
# NOTE(review): the occupation list is hard-coded — confirm these are the six
# most frequent occupations in the filtered data.
print(" Top 6 Occupation High Death Number with Gender ")
df_top6=df2[df2['Occupation'].isin(['Artist', 'Politician','Athlete','Military personnel','Researcher','Journalist'])]
# One bar per (occupation, gender) pair; bar length is the number of rows.
df_top6.groupby(['Occupation','Gender'])['Age of death'].count().plot.barh(figsize=(18,10))


# Data Preparation for NLP


* Cleaning Text

In [157]:
# Fetch the NLTK stop-word corpus, then build the English stop-word set and
# the Snowball stemmer used by the cleaning loop.
nltk.download('stopwords')

stop = set(stopwords.words('english'))
sno = nltk.stem.SnowballStemmer('english')

def cleanpunc(sentence):
    """Strip punctuation from a piece of text.

    The characters ? | ! ' " # are deleted outright; afterwards each of
    . , ) ( / is replaced by a single space.
    """
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    return re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
# Show the stop-word set, a separator line, and the stem produced for a sample word.
print(stop)
print('************************************')
print(sno.stem('japan'))



In [158]:
# Build a cleaned, stemmed version of every short description.
# A word is kept when it is purely alphabetic, longer than two characters and
# not an English stop word; kept words are lower-cased and stemmed.
# Each word is UTF-8 encoded so final_string holds bytes, which is the form
# the following cell decodes with .str.decode("utf-8").
final_string = []
for sent in df2['Short description'].values:
    filtered_sentence = []
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if not (cleaned_words.isalpha() and len(cleaned_words) > 2):
                continue
            lowered = cleaned_words.lower()
            if lowered in stop:
                continue
            filtered_sentence.append(sno.stem(lowered).encode('utf8'))
    # One space-separated bytes string per description.
    final_string.append(b" ".join(filtered_sentence))


In [159]:
# Attach the cleaned descriptions as a new column, decoding the UTF-8 bytes to str.
df2['Short description2'] = pd.Series(final_string, index=df2.index).str.decode("utf-8")


In [160]:
# Display the cleaned description column.
df2['Short description2']

# Clustering


# 1)-KMeans by Bag of Words(BOW)

In [161]:
# Bag-of-words counts for the cleaned descriptions; the last line shows
# the shape of the resulting matrix.
count_vect = CountVectorizer()
bow = count_vect.fit_transform(df2['Short description2'].to_numpy())
bow.shape


In [162]:
# Display the bag-of-words matrix object.
bow

In [163]:
# Vocabulary learned by the bag-of-words vectorizer.
# get_feature_names() no longer exists in current scikit-learn; prefer
# get_feature_names_out() and fall back only when running an old release.
terms = (
    count_vect.get_feature_names_out()
    if hasattr(count_vect, 'get_feature_names_out')
    else count_vect.get_feature_names()
)
terms[1:10]


In [164]:
# Cluster the bag-of-words matrix into 10 groups with k-means++ initialisation;
# random_state pins the random initialisation.
model = KMeans(n_clusters = 10,init='k-means++', random_state=99)
model.fit(bow)


In [165]:
# Per-row cluster assignments and the fitted centroids of the bag-of-words model.
labels, cluster_center = model.labels_, model.cluster_centers_


In [166]:
# Display the centroid matrix of the bag-of-words model.
cluster_center

In [167]:
print("silhouette_score")
# Silhouette score of the bag-of-words clustering, using Euclidean distance.
silhouette_score = metrics.silhouette_score(X=bow, labels=labels, metric='euclidean')
silhouette_score

In [168]:
# Store each row's bag-of-words cluster label on the frame and preview two rows.
df2['Bow Clus Label'] = model.labels_
df2.head(2)


In [169]:
print(" Nomber Of Clusters")
# Number of rows assigned to each bag-of-words cluster. The x positions come
# from the grouped index itself, so the bars stay aligned with their counts
# even if some cluster label is absent from the data.
bow_cluster_sizes = df2.groupby('Bow Clus Label')['Short description2'].count()
plt.bar(bow_cluster_sizes.index, bow_cluster_sizes.values, alpha = 0.4)
plt.title('KMeans cluster points')
plt.xlabel("Cluster number")
plt.ylabel("Number of points")
plt.show()


# Top terms per cluster: grouping people by occupation or identity

In [170]:
print("Top terms per cluster:")
# Column indices of each centroid sorted from largest to smallest weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() no longer exists in current scikit-learn; prefer
# get_feature_names_out() and fall back only when running an old release.
terms = (
    count_vect.get_feature_names_out()
    if hasattr(count_vect, 'get_feature_names_out')
    else count_vect.get_feature_names()
)
for i in range(10):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    # End the line once per cluster, after its ten terms have been printed.
    print()


In [171]:
print("Country Base Clusters And Short Description ")
# Rows per (cluster, country) pair, re-indexed by country.
df_press_clus = (
    df2.groupby(['Bow Clus Label', 'Country'])['Short description']
       .count()
       .reset_index()
       .set_index('Country')
)
df_press_clus


In [172]:
print("Name Base Clusters And Short Description ")
# Rows per (cluster, name) pair, re-indexed by name.
df_key_clus = (
    df2.groupby(['Bow Clus Label', 'Name'])['Short description']
       .count()
       .reset_index()
       .set_index('Name')
)
df_key_clus


In [173]:
print("Top 10 Country Base Clusters And Short Description ")
# The ten countries with the most rows in cluster 0, as horizontal bars.
(
    df_press_clus[df_press_clus['Bow Clus Label'] == 0]
    .sort_values(by=['Short description'], ascending=True)
    .tail(10)
    .plot.barh(y='Short description')
)


In [174]:
# The ten names with the most rows in cluster 0, as horizontal bars.
(
    df_key_clus[df_key_clus['Bow Clus Label'] == 0]
    .sort_values(by=['Short description'], ascending=True)
    .tail(10)
    .plot.barh(y='Short description')
)


# 2)-KMeans by TF-IDF


In [175]:
# TF-IDF weights for the cleaned descriptions; the last line shows the
# shape of the resulting matrix.
tfidf_vect = TfidfVectorizer()
tfidf = tfidf_vect.fit_transform(df2['Short description2'].to_numpy())
tfidf.shape


In [176]:
# Cluster the TF-IDF matrix into 10 groups; random_state pins the random initialisation.
model_tf = KMeans(n_clusters = 10,random_state=99)
model_tf.fit(tfidf)


In [177]:
# Per-row cluster assignments and the fitted centroids of the TF-IDF model.
labels_tf, cluster_center_tf = model_tf.labels_, model_tf.cluster_centers_


In [178]:
# Display the centroid matrix of the TF-IDF model.
cluster_center_tf

In [179]:
# Vocabulary learned by the TF-IDF vectorizer.
# get_feature_names() no longer exists in current scikit-learn; prefer
# get_feature_names_out() and fall back only when running an old release.
terms1 = (
    tfidf_vect.get_feature_names_out()
    if hasattr(tfidf_vect, 'get_feature_names_out')
    else tfidf_vect.get_feature_names()
)


In [180]:
# Peek at vocabulary entries 1 through 9 (the slice skips entry 0).
terms1[1:10]


In [181]:
# Silhouette score of the TF-IDF clustering, using Euclidean distance.
silhouette_score_tf = metrics.silhouette_score(X=tfidf, labels=labels_tf, metric='euclidean')
silhouette_score_tf


In [182]:
# df3 is a second name for the same DataFrame object as df2 (no copy is made),
# so the column added below is visible through both names.
df3 = df2
df3['Tfidf Clus Label'] = model_tf.labels_
df3.head(5)


In [183]:
# Number of rows assigned to each TF-IDF cluster.
df3.groupby(['Tfidf Clus Label'])['Short description'].count()

In [184]:
print("Top terms per cluster:")
# Column indices of each TF-IDF centroid sorted from largest to smallest weight.
order_centroids = model_tf.cluster_centers_.argsort()[:, ::-1]
for i in range(10):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms1[ind], end='')
    # End the line once per cluster, after its ten terms have been printed.
    print()


In [185]:
print(" Nomber Of Clusters")
# Number of rows assigned to each TF-IDF cluster. The x positions come from
# the grouped index itself, so the bars stay aligned with their counts even
# if some cluster label is absent from the data.
tfidf_cluster_sizes = df3.groupby('Tfidf Clus Label')['Short description'].count()
plt.bar(tfidf_cluster_sizes.index, tfidf_cluster_sizes.values, alpha = 0.4)
plt.title('KMeans cluster points')
plt.xlabel("Cluster number")
plt.ylabel("Number of points")
plt.show()


In [186]:
print('Review Assigned To Cluster ')
# groups maps each cluster label to the index LABELS of its rows, so the rows
# must be fetched with .loc (label-based), not .iloc (position-based).
# The grouping is computed once instead of three times per cluster.
tfidf_cluster_rows = df3.groupby('Tfidf Clus Label').groups
for i in range(10):
    # Three sample descriptions are printed per cluster.
    print("3 review of assigned to cluster ", i)
    print("-" * 70)
    member_labels = tfidf_cluster_rows.get(i, [])
    for position in (5, 10, 20):
        # Skip samples that a small cluster does not have.
        if position < len(member_labels):
            print(df3.loc[member_labels[position], 'Short description'])
            print('\n')
    print("_" * 70)


# 3)-KMeans by Word2Vec


In [187]:
# Split every cleaned description into its whitespace-separated tokens.
list_of_sent = [sent.split() for sent in df3['Short description2'].values]


In [188]:
# Compare the first raw description with its token list.
print(df2['Short description'].values[0])
print("*****************************************************************")
print(list_of_sent[0])


In [189]:
def cleanpunc(sentence):
    """Clean a word or sentence of punctuation and special characters.

    First the characters ? | ! ' " # are removed, then each of . , ) ( /
    is turned into a space. Same body as the definition used for text
    cleaning earlier in the notebook.
    """
    cleaned = sentence
    for pattern, replacement in ((r'[?|!|\'|"|#]', r''), (r'[.|,|)|(|\|/]', r' ')):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned


In [190]:
# Token lists used to train Word2Vec: every description is split on
# whitespace, each piece is passed through cleanpunc, and only purely
# alphabetic tokens are kept (lower-cased).
list_of_sent_train = [
    [
        cleaned_words.lower()
        for w in sent.split()
        for cleaned_words in cleanpunc(w).split()
        if cleaned_words.isalpha()
    ]
    for sent in df3['Short description2'].values
]


In [191]:
# Train 100-dimensional Word2Vec embeddings on the token lists using 4 workers.
# NOTE(review): no seed is passed, so vectors may differ between runs — confirm whether that matters.
w2v_model=gensim.models.Word2Vec(list_of_sent_train,vector_size=100, workers=4)


In [192]:
# Average Word2Vec vector per description.
# Words missing from the trained vocabulary are skipped (only KeyError is
# caught, so unrelated errors still surface). A description with no known
# word keeps an all-zero vector instead of being divided by zero.
embedding_size = w2v_model.wv.vector_size
sent_vectors = []
for sent in list_of_sent_train:
    sent_vec = np.zeros(embedding_size)
    cnt_words = 0
    for word in sent:
        try:
            vec = w2v_model.wv[word]
        except KeyError:
            continue
        sent_vec += vec
        cnt_words += 1
    if cnt_words:
        sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
sent_vectors = np.array(sent_vectors)
sent_vectors.shape


In [193]:
# Candidate cluster counts to try: 3 through 10 inclusive.
num_clus = list(range(3, 11))
num_clus


# * Choosing the best cluster using Elbow Method.


In [194]:
# Fit KMeans for every candidate cluster count and record the inertia
# (sum of squared distances to the nearest centroid) of each fit.
# random_state matches the other KMeans models in this notebook so the
# curve is the same on every run.
squared_errors = []
for cluster in num_clus:
    kmeans = KMeans(n_clusters = cluster, random_state=99).fit(sent_vectors)
    squared_errors.append(kmeans.inertia_)

# argmin returns a POSITION in squared_errors; map it back through num_clus to
# get the real cluster count (num_clus starts at 3, so "+ 2" was off by one).
# NOTE(review): inertia normally keeps falling as clusters are added, so this
# tends to pick the largest candidate rather than a true elbow — confirm.
optimal_clusters = num_clus[int(np.argmin(squared_errors))]
plt.plot(num_clus, squared_errors)
plt.title("Elbow Curve to find the no. of clusters.")
plt.xlabel("Number of clusters.")
plt.ylabel("Squared Loss.")
# Label the selected point with its (cluster count, loss) pair.
xy = (optimal_clusters, min(squared_errors))
plt.annotate('(%s, %s)' % xy, xy = xy, textcoords='data')
plt.show()

print ("The optimal number of clusters obtained is - ", optimal_clusters)
print ("The loss for optimal cluster is - ", min(squared_errors))


In [195]:
# Final KMeans on the averaged Word2Vec vectors. random_state is fixed, as for
# the other KMeans models in this notebook, so cluster labels are reproducible.
model2 = KMeans(n_clusters = optimal_clusters, random_state=99)
model2.fit(sent_vectors)


In [196]:
# Cluster assignment predicted for every sentence vector, plus the labels and
# centroids stored on the fitted model.
word_cluster_pred = model2.predict(sent_vectors)
word_cluster_pred_2, word_cluster_center = model2.labels_, model2.cluster_centers_


In [197]:
# Display the centroid at row 1 (the slice keeps it two-dimensional).
word_cluster_center[1:2]


In [198]:
# dfa is another name for the same DataFrame object as df3 (no copy is made),
# so the column added below is visible through both names.
dfa = df3
dfa['AVG-W2V Clus Label'] = model2.labels_
dfa.head()


In [199]:
# Number of rows assigned to each averaged-Word2Vec cluster.
dfa.groupby(['AVG-W2V Clus Label'])['Short description'].count()


In [200]:
print('Review Assigned To Cluster ')
# groups maps each cluster label to the index LABELS of its rows, so the rows
# must be fetched with .loc (label-based), not .iloc (position-based).
# The grouping is computed once instead of twice per cluster.
w2v_cluster_rows = dfa.groupby('AVG-W2V Clus Label').groups
for i in range(optimal_clusters):
    print("A review of assigned to cluster ", i)
    print("-" * 70)
    member_labels = w2v_cluster_rows.get(i, [])
    for position in (0, 1):
        # Skip samples that a very small cluster does not have.
        if position < len(member_labels):
            print(dfa.loc[member_labels[position], 'Short description'])
            print('\n')
    print("_" * 70)


# Data Preprocessing and Visualization on the First 1500 Rows


In [201]:
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Plot a histogram or bar chart for each low-cardinality column of df.

    Parameters:
        df: DataFrame to inspect. Only columns with more than 1 and fewer
            than 50 distinct values are plotted.
        nGraphShown: maximum number of columns to plot.
        nGraphPerRow: number of subplots per grid row.

    Columns whose first value is numeric are drawn with hist(); all other
    columns are drawn as a bar chart of their value counts.
    """
    nunique = df.nunique()
    # For displaying purposes, pick columns that have between 1 and 50 unique values
    df = df[[col for col in df if 1 < nunique[col] < 50]]
    nRow, nCol = df.shape
    if nCol == 0:
        # Nothing qualifies; avoid creating a zero-height figure.
        print('No distribution plots shown: no column has between 1 and 50 unique values')
        return
    columnNames = list(df)
    # Ceiling division; the grid size passed to plt.subplot must be an integer.
    nGraphRow = (nCol + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(min(nCol, nGraphShown)):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
            valueCounts = columnDf.value_counts()
            valueCounts.plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()


In [202]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    """Draw the correlation matrix of df's numeric columns as a coloured grid.

    Parameters:
        df: DataFrame to analyse. If it carries a `dataframeName` attribute,
            that value is used in the plot title; otherwise a generic label
            is used.
        graphWidth: width and height of the square figure, in inches.

    Columns containing NaN, non-numeric columns and constant columns are
    excluded. When fewer than two columns remain, a message is printed and
    nothing is plotted.
    """
    # Fall back to a generic label so frames without the custom attribute still work.
    filename = getattr(df, 'dataframeName', 'DataFrame')
    df = df.dropna(axis='columns') # drop columns with NaN (axis must be passed by keyword)
    # Correlation is only defined for numeric data; text columns make corr() fail
    # on current pandas.
    df = df.select_dtypes(include=[np.number])
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum = 1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()


In [203]:
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    """Draw a scatter matrix with KDE diagonals for df's numeric columns.

    Parameters:
        df: DataFrame to analyse; only numeric, NaN-free, non-constant
            columns are used, and at most the first ten of them.
        plotSize: width and height of the square figure, in inches.
        textSize: font size of the correlation annotations.

    Each panel above the diagonal is annotated with the correlation
    coefficient of its column pair.
    """
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna(axis='columns') # axis must be passed by keyword
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    # reduce the number of columns for matrix inversion of kernel density plots
    columnNames = list(df)[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # Use numpy directly rather than the incidental plt.np alias.
    for i, j in zip(*np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()


In [204]:
# Load only the first nRowsRead rows of the dataset for the plots below.
nRowsRead = 1500 # specify 'None' if want to read whole file
df1 = pd.read_csv('../input/age-dataset/AgeDataset-V1.csv', delimiter=',', nrows = nRowsRead)
# Used as the label in the correlation-matrix title; it now names the file
# that was actually read.
df1.dataframeName = 'AgeDataset-V1.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')


In [205]:
# Preview the first five rows of the sample.
df1.head(5)


# Distribution graphs (histogram/bar graph) of sampled columns:


In [206]:
# Duplicates VALUE 
# Report how many rows of the sample are exact duplicates of an earlier row,
# as a count and as a percentage of all rows.

print(f'Duplicates in the dataset: {df1.duplicated().sum()}')
print(f'Percentage of duplicates: {df1.duplicated().sum()/len(df1)*100}%')


In [207]:
#Cardinality 
# Number of distinct values in each column of the sample.
df1.nunique()


In [208]:
# Print the column dtypes and non-null counts of the sample.
df1.info()


# Data Analysis


In [209]:
# Number of people per gender within the sample, displayed from most to
# least common. This reassigns people_by_gender, which earlier held the
# counts for df2.
people_by_gender = (
    df1.groupby('Gender')
       .size()
       .reset_index(name='Count')
)
people_by_gender.sort_values(by='Count', ascending=False)

In [210]:
# Configure the theme in a single call. A separate sns.set(rc=...) afterwards
# re-applies seaborn's default style and would discard the whitegrid choice.
sns.set_theme(style="whitegrid", rc={"figure.figsize": (10, 10)})  # width=10, height=10
# Bar per gender; bar height is seaborn's default aggregate of 'Age of death'.
ax = sns.barplot(x="Gender", y="Age of death", data=df1)


In [213]:
 # Correlation matrix of the sample on an 8 x 8 inch figure.
 plotCorrelationMatrix(df1, 8)


# 1) Scatter and density plots


In [214]:
# Scatter/density matrix of the sample: 20 x 20 inch figure, annotation font size 15.
plotScatterMatrix(df1, 20, 15)


# 2) Pairplots


In [215]:
 # Pairwise plots of the sample, coloured by occupation.
 #sns.set(rc={'figure.figsize':(30,30)})

 sns.pairplot(df1, hue='Occupation')


In [216]:
# Pairwise plots of the sample, coloured by country, with histograms on the diagonal.
sns.pairplot(df1, hue="Country", diag_kind="hist")


In [217]:
# Pairwise plots of the sample, coloured by gender, with histograms on the diagonal.
sns.pairplot(df1, hue="Gender", diag_kind="hist")


In [218]:
# Pairwise plots of the sample, coloured by manner of death, with histograms on the diagonal.
sns.pairplot(df1, hue="Manner of death", diag_kind="hist")


# 3) Heatmap

In [219]:
# Correlation heatmap of the numeric columns. Text columns are excluded first
# because corr() fails on them in current pandas; the matrix is computed once.
numeric_corr = df1.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(29,15))
sns.set_context('poster', font_scale=0.9)
sns.heatmap(numeric_corr, cmap='coolwarm', annot=True)
#plt.title('R')
plt.show()


In [220]:
# Pairwise plots of the sample drawn as kernel density estimates.
sns.pairplot(df1, kind="kde")


# Function for plotting a feature's distribution by gender


In [221]:
def corr_map(feature,size):
    """Plot the distribution of one df1 column, split by gender.

    feature: name of the df1 column placed on the x axis.
    size: (width, height) tuple passed to plt.figure as figsize.

    Draws a histogram with a KDE overlay on a new figure, coloured by
    'Gender', and sets the title and x label from the feature name.
    """
    plt.figure(figsize=size)
    sns.set_context('poster', font_scale=1)

    axes = sns.histplot(data=df1, x=feature, hue='Gender', binwidth=1, kde=True)

    axes.set_title(f'{feature} distribution')
    axes.set_xlabel(f'{feature} Value')


# Occupation

In [222]:
# Occupation distribution by gender on a 70 x 70 inch figure.
corr_map('Occupation',size=((70, 70)))


# Gender

In [223]:
# Gender distribution on a 10 x 10 inch figure.
corr_map('Gender',size=((10, 10)))


# Country

In [224]:
# Country distribution by gender on a 30 x 30 inch figure.
corr_map('Country',size=((30, 30)))


# Manner of death

In [225]:
# Manner-of-death distribution by gender on a 50 x 50 inch figure.
corr_map('Manner of death',size=((50, 50)))
