### Context
The Axial Age dataset tracks a variety of sociopolitical norms and their development across key areas in Afro-Eurasia. The specific scores for each sociopolitical norm for each date (varying time spans between 5300 BCE and 1800 CE in 100-year increments) within 10 NGAs (natural geographic areas) were agreed upon by a group of experts and compiled into the dataset.

![](https://images-na.ssl-images-amazon.com/images/I/71nLQuk865L._RI_.jpg)

# 1. import packages and theme

In [None]:
# data manipulation
import pandas as pd 
import numpy as np
import os

# data vizualisation 
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# dataprep
!pip install dataprep
from dataprep.eda import *
from dataprep.datasets import load_dataset
from dataprep.eda import create_report
from dataprep.eda import plot
from dataprep.eda import plot_correlation
from dataprep.eda.missing import plot_missing

# Default plotting theme: seaborn styling first, then the matplotlib
# overrides for figure size and font size.
sns.set(
    context='notebook',
    style='darkgrid',
    palette='colorblind',
    font='sans-serif',
    font_scale=1,
    rc=None,
)
matplotlib.rcParams.update({'figure.figsize': [15, 10], 'font.size': 15})

# Silence every warning category for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

# 2. data analysis

In [None]:
# Read the Axial Age CSV into a dataframe and display it.
dataset_path = '../input/axial-age-dataset/AxialAgeDataset.csv'
df = pd.read_csv(dataset_path)
df

In [None]:
# DataFrame.shape is (number of rows, number of columns); label each count
# accordingly (the labels were previously swapped).
n_rows, n_cols = df.shape
print('we had ', n_rows, 'rows and ', n_cols, 'columns')

In [None]:
# List the column labels as loaded, before any renaming.
df.columns

#### As we can see, the names of our columns are overlapping and disorganized, so we are going to rename the columns by creating a function that:
* strips leading numbers, periods, and spaces,
* replaces whitespace, dashes, and periods with underscores, and lowercases letters,
* makes coding easier and keeps the text in compliance with PEP-8 formatting.
    
    * Parameters:
        INPUTS:
        str (str): This is the text to be preprocessed.
    
    * OUTPUTS:
        str : This is the fully preprocessed string.
    

In [None]:
# Create a text preprocessing function

import re

def text_preprocess(str):
    """Normalise a label into a lowercase, underscore-separated string.

    Leading digits, periods and spaces are stripped. Each run of whitespace
    is then replaced by a single underscore, each remaining dash or period
    by its own underscore, and the result is lowercased.

    Parameters:
        str (str): The text to be preprocessed.

    Returns:
        str: The fully preprocessed string.
    """
    trimmed = str.lstrip('0123456789. ')
    # Whitespace first, so a run of blanks collapses to one underscore
    # before dashes and periods are mapped one-for-one.
    without_whitespace = re.sub(r"\s+", '_', trimmed)
    return re.sub(r"[\-.]", '_', without_whitespace).lower()

# Run the preprocessing over every column label, then over the values
# of the nga column.
df.columns = df.columns.to_series().map(text_preprocess)
df["nga"] = df["nga"].apply(text_preprocess)

# Print the resulting labels and the distinct NGA names to confirm that
# the preprocessing worked.
print(f"RegExed Columns:\n{df.columns!s}")
print(f"\nRegExed NGAs:\n{df.nga.unique()!s}")

In [None]:
# Print the dataframe summary (columns, non-null counts, dtypes).
df.info()

In [None]:
# Show the dtype of each column.
df.dtypes

### We are going to visualize our data types in a plot to make them easier to see

In [None]:
# Plot the number of columns per dtype with dataprep's plot().
# NOTE(review): the argument here is a Series (value_counts output), not a
# DataFrame — confirm that dataprep's plot() accepts it.
plot(df.dtypes.value_counts())

In [None]:
# Show the dtype distribution twice: bar chart on the left axes, pie chart
# on the right axes.
fig, axarr = plt.subplots(1, 2, figsize=(20, 8))

# Count the columns per dtype once and reuse the result for both charts.
dtype_counts = df.dtypes.value_counts()

# One explode offset per wedge: a fixed three-element list fails as soon as
# the number of distinct dtypes is not exactly three.
dtype_counts.plot.pie(explode=[0.1] * len(dtype_counts), autopct='%1.1f%%', shadow=True, ax=axarr[1])
axarr[1].set_title("type of our data ", fontsize=18)

# Each axes gets its title through set_title only; a plt.title() call here
# would target whichever axes is current and could overwrite one of the
# explicit titles (and its font size).
dtype_counts.plot(kind='bar', ax=axarr[0])
axarr[0].set_title("type of our data ", fontsize=18)

#### About **94% of our data are numerical** and **6% are categorical**

In [None]:
# Descriptive statistics for all columns (include='all' covers non-numeric ones too).
df.describe(include='all')

# 3. Finding and handling missing values

In [None]:
# Per-column missing-value summary: absolute count, share of all rows in
# percent, and the column dtype.
miss_val = df.isnull().sum()
miss_per = miss_val / len(df) * 100

data = {'missing values': miss_val, 'missing values by %': miss_per, 'type': df.dtypes}
frame = pd.DataFrame(data)
frame

In [None]:
# Visualise the missing values of df with dataprep's plot_missing.
plot_missing(df)

Our missing values are hard to fill in with traditional methods, so we are going to replace them all with 0.

In [None]:
# Replace every missing value, in every column, with 0.
df=df.fillna(0)

In [None]:
# Rebuild the missing-value summary (count, percentage of rows, dtype) to
# check the dataframe again after the fill step.
miss_val = df.isnull().sum()
miss_per = miss_val / len(df) * 100

data = {'missing values': miss_val, 'missing values by %': miss_per, 'type': df.dtypes}
frame = pd.DataFrame(data)
frame

Let's check our data description after cleaning.

In [None]:
# Descriptive statistics for all columns, recomputed on the current df.
df.describe(include='all')

# 4. data visualization 

In [None]:
# Draw histograms of df's columns on a 20x15 figure with black bar edges;
# the trailing semicolon suppresses the cell's text output.
df.hist(figsize=(20,15),edgecolor='black');

# A. date

In [None]:
# Count how many rows fall on each date_from value.
plt.figure(figsize=(25,10))
# Pass the column through x=: current seaborn releases take the plotted
# vector by keyword, and a bare positional Series is not read as x.
sns.countplot(x=df.date_from)
# Use a numeric rotation; matplotlib documents rotation as a number or
# 'vertical'/'horizontal', and the string '90' is neither.
plt.xticks(rotation=90)
plt.title('Historical years')

#### Relation between date and nga :

In [None]:
# Line plot with nga on the x axis and date_from on the y axis,
# drawn on a 20x10 figure.
plt.figure(figsize=(20, 10))
sns.lineplot(
    x='nga',
    y="date_from",
    data=df,
)
plt.title('Relation betwwen time and nga');

#### Relation between time and other features

In [None]:
# dataprep plot() for the date_from column of df.
plot(df,'date_from')

In [None]:

# Plot total observations by feature over time.
col_list = list(df)
# Feature columns: everything after the first two columns and before the
# last one.
# NOTE(review): assumes the first two columns are nga and date_from and the
# last one is the precomputed sum — confirm against the CSV layout.
new_col_list = col_list[2:-1]

# Create a dataframe for the plot: one row per date_from value (ascending),
# with a 'year' column and each feature's total over all rows of that date.
adoptiondf = (
    df.groupby('date_from')[new_col_list]
    .sum()
    .reset_index()
    .rename(columns={'date_from': 'year'})
)

# Plot subplots of features over time, one subplot per feature.
# The grid is sized from the feature count, so no feature is silently left
# out and a shorter feature list no longer raises IndexError
# (11 features still give the 6x2 layout).
n_plot_cols = 2
n_plot_rows = (len(new_col_list) + n_plot_cols - 1) // n_plot_cols
fig = plt.figure(figsize=(20,30))
for position, feature in enumerate(new_col_list, start=1):
    ax = fig.add_subplot(n_plot_rows, n_plot_cols, position)
    adoptiondf.plot(x='year', y=feature,
                    ax=ax, legend=False, color="green")
    # Same fixed y-range on every subplot so the panels are comparable.
    ax.set_ylim(0, 11)
    

## B. nga

In [None]:
# Number of rows per NGA.
df.nga.value_counts()

In [None]:
# Show how many rows each NGA contributes: count plot on the left axes,
# pie chart on the right axes.
fig, axarr = plt.subplots(1, 2, figsize=(30, 8))

df.nga.value_counts().plot.pie(autopct='%1.1f%%',shadow=True,ax=axarr[1])
axarr[1].set_title("counts of nga", fontsize=18)

# Pass the column through x=: current seaborn releases take the plotted
# vector by keyword, and a bare positional Series is not read as x.
sns.countplot(x=df.nga,ax=axarr[0])
axarr[0].set_title("counts of nga", fontsize=18)
plt.show()

In [None]:
# dataprep plot() for the nga column of df.
plot(df,"nga")

### Now we are going to look at some distributions

In [None]:
# Kernel density estimate of the "sum" column, one curve per NGA.
# FacetGrid creates its own figure (sized through height=), so no
# plt.figure() call precedes it: that would only leave an extra, empty
# figure in the output.
grid = sns.FacetGrid(df, hue="nga", height=8, xlim=(0, 20))
grid.map(sns.kdeplot, "sum").add_legend()
plt.title('Sum Distribution of NGA',fontsize=15);

plt.show();

### Relation between nga & sum

In [None]:
# Box plot of "sum" for each NGA, drawn from the rows sorted by nga.
# catplot is a figure-level function that builds its own figure, so no
# plt.figure() call precedes it: that would only leave an extra, empty
# figure in the output.
sns.catplot(x="nga", y="sum", kind="box",data=df.sort_values("nga"))
plt.title('NGA vs Sum',fontsize=15)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Generate dataprep's report for df.
create_report(df)