Read the CSV file and drop the unnecessary data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the 2014 registration data (read as ISO-8859-1) and discard the
# 'app_type' column in the same expression, then preview the first rows.
df = (
    pd.read_csv(
        '/kaggle/input/elementary-school-admission-romania-2014/elementary_school_registration_2014.csv',
        encoding="ISO-8859-1",
    )
    .drop(columns=['app_type'])
)
print(df.head())

Check for incomplete data

In [None]:
# Missing-value overview: heatmap of the boolean mask returned by df.isnull()
# (one cell per row/column pair of df).
missing_mask = df.isnull()
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(missing_mask, ax=ax)
plt.show()

Visualisation - pupils distribution, Urban vs Rural

In [None]:
# Number of pupils per value of 'pupil_env'.
fig, ax = plt.subplots()
sns.countplot(data=df, x='pupil_env', palette='rainbow', ax=ax)
ax.set_title('Pupils located Urban vs Rural')
ax.grid()
# No legend: the plot has no hue grouping, hence no labelled artists, and a
# legend() call would only emit matplotlib's "no artists with labels" warning.
plt.show()

Visualisation - Pupils going to after school - Urban vs Rural

In [None]:
# Counts per 'after_school' value, split by 'pupil_env'.
fig, ax = plt.subplots()
sns.countplot(data=df, x='after_school', hue='pupil_env', palette='rainbow', ax=ax)
ax.set_title('Pupils going to after school Urban vs Rural')
ax.grid()
ax.legend(loc='upper right')
plt.show()

Visualisation - Pupils with single_parent Urban vs Rural

In [None]:
# Counts per 'single_parent' value, split by 'pupil_env'.
fig, ax = plt.subplots()
sns.countplot(data=df, x='single_parent', hue='pupil_env', palette='rainbow', ax=ax)
ax.set_title('Pupils with single_parent Urban vs Rural')
ax.grid()
ax.legend(loc='upper right')
plt.show()

Visualisation - Pupils - orphan_institution - Urban vs Rural

In [None]:
# Counts per 'orphan_institution' value, split by 'pupil_env'.
fig, ax = plt.subplots()
sns.countplot(data=df, x='orphan_institution', hue='pupil_env', palette='rainbow', ax=ax)
ax.set_title('Pupils - orphan_institution - Urban vs Rural')
ax.grid()
ax.legend(loc='upper right')
plt.show()

In [None]:
# Number of pupils per value of 'teaching_language' (wide figure so the
# category labels on the x axis have room).
fig, ax = plt.subplots(figsize=(14, 5))
sns.countplot(ax=ax, data=df, x='teaching_language')
# The title names the plotted column; the previous title was copied from the
# pupil_env chart and described a different plot.
ax.set_title('Pupils by teaching language')
ax.grid()
# No legend: there is no hue grouping, so there are no labelled artists.
plt.show()

Clean some data and visualisation for - Pupils admission stage Urban vs Rural

In [None]:
# Replace the raw admission-stage codes with readable labels in one pass.
# Assigning the result back to the column avoids chained indexing
# (df[col][mask] = value), which triggers SettingWithCopyWarning and silently
# does nothing when pandas Copy-on-Write is active.
stage_labels = {
    'I-1': 'First 1',
    'I-2': 'First 2',
    'I-3': 'First 3',
    'II': 'Second',
}
# TODO(review): the cell also tried to relabel 'I-1' as 'Adjust', but 'I-1' is
# already mapped to 'First 1' so that line could never match. The raw code for
# the 'Adjust' stage is not visible here — confirm it and add it to the mapping.
df['admission_stage'] = df['admission_stage'].replace(stage_labels)

fig, ax = plt.subplots()
sns.countplot(data=df, x='admission_stage', hue='pupil_env', palette='PiYG_r', ax=ax)
ax.set_title('Pupils admision stage Urban vs Rural')
ax.grid()
ax.legend(loc='upper right')
plt.show()

Clean some data and visualisation for - Pupils teaching language (other than RO) Urban vs Rural

In [None]:
# Raw values noted for 'teaching_language':
# ['Limba român?' 'Limba german?' 'Limba maghiar?' 'Limba slovac?', 'Limba rromani' 'Limba croat?' 'Limba sârb?' 'Limba ucrainean?']
# Shorten them to compact codes in a single replace; assigning back to the
# column avoids chained indexing (df[col][mask] = value), which is a silent
# no-op when pandas Copy-on-Write is active.
language_codes = {
    'Limba român?': 'RO',
    'Limba german?': 'DE',
    'Limba maghiar?': 'HU',
    'Limba slovac?': 'SK',
    'Limba rromani': 'rromani',
    'Limba croat?': 'CRO',
    'Limba sârb?': 'SRB',
    'Limba ucrainean?': 'UKR',
}
df['teaching_language'] = df['teaching_language'].replace(language_codes)

# Plot only the rows whose language is not 'RO'. Filtering the frame itself
# keeps x and hue drawn from the same rows instead of mixing a filtered Series
# with the unfiltered data=df.
non_ro = df[df['teaching_language'] != 'RO']
fig, ax = plt.subplots(figsize=(14, 6))
sns.countplot(ax=ax, data=non_ro, x='teaching_language', hue='pupil_env', palette='PiYG_r')
ax.set_title('Pupils teaching language (other than RO) Urban vs Rural')
ax.grid()
ax.legend(loc='upper right')
plt.show()

Matrix - Urban and Rural pupil environment versus Urban and Rural school environment. We can see the daily migration of pupils from Urban to Rural areas or vice versa

In [None]:
# Count rows for every (pupil_env, school_env) pair and show the resulting
# table as an annotated heatmap.
env_columns = ['pupil_env', 'school_env']
df_env = df[env_columns]
pt = df_env.pivot_table(
    index='pupil_env',
    columns='school_env',
    aggfunc='size',
)
fig, ax = plt.subplots()
sns.heatmap(data=pt, annot=True, fmt='g', cmap='RdBu', ax=ax)
plt.show()

Merge main dataframe with the 'school network' csv file on 'SIRUES' code. Clean some data.

Visualisation - Pupils - school_category - Urban vs Rural

In [None]:
# Load the school network table and preview it (the preview now shows the
# newly loaded df2 rather than df again).
df2 = pd.read_csv('/kaggle/input/elementary-school-admission-romania-2014/school_network.csv', encoding = "ISO-8859-1")
print(df2.head())
# Columns noted for school_network.csv:
# ['no', 'judet', 'name', 'SIRUES', 'school_type', 'school_type2', 'school_category', 'education_form', 'teaching_language']

# Attach 'judet' and 'school_category' to each pupil row via the SIRUES code.
# NOTE(review): if SIRUES is not unique in df2 this merge duplicates pupil
# rows — confirm uniqueness against the data.
df = pd.merge(df, df2[['SIRUES', 'judet', 'school_category']], on='SIRUES')

# Raw 'school_category' values noted for the data:
# 'Gimnaziu ( clasele I-VIII sau I-X)', 'Grup ?colar', 'Colegiu', 'Liceu',
# '?coal? special?', 'Scoal? primar? (clasele I-IV)', 'Gradini??',
# 'Alte ( casa corp didactic, inspectorat ?colar, etc)', 'Scoal? postliceal?'
# Shorten / clean them in one replace; assigning back to the column avoids
# chained indexing (df[col][mask] = value), which is a silent no-op when
# pandas Copy-on-Write is active. 'Colegiu' and 'Liceu' are left unchanged.
school_category_labels = {
    'Gimnaziu ( clasele I-VIII sau I-X)': 'Gimnaziu',
    'Grup ?colar': 'Grup scolar',
    '?coal? special?': 'Scoala speciala',
    'Scoal? primar? (clasele I-IV)': 'Scoala primara',
    'Gradini??': 'Gradinita',
    'Alte ( casa corp didactic, inspectorat ?colar, etc)': 'Alte',
    'Scoal? postliceal?': 'Scoala postliceala',
}
df['school_category'] = df['school_category'].replace(school_category_labels)

fig, ax = plt.subplots(figsize=(14, 6))
sns.countplot(ax=ax, data=df, x='school_category', hue='pupil_env', palette='PiYG_r')
ax.set_title('Pupils - school_category - Urban vs Rural')
ax.grid()
ax.legend(loc='upper right')
plt.show()

Read the json file including counties coordinates. 
Add to main dataframe the 'id' field for matching data to json file


In [None]:
import json
from plotly.offline import plot
import plotly.express as px
# Load the county GeoJSON that the choropleth maps pass as `geojson=`.
with open('/kaggle/input/romania-counties/romania.geojson.json', 'r') as geojson_file:
    counties = json.load(geojson_file)

# County lookup table: drop the 'date' and 'populatie' columns and rename
# 'Region' to 'judet' so it shares a key name with df.
df_counties = (
    pd.read_csv('/kaggle/input/romania-counties/counties_id.csv')
    .drop(columns=['date', 'populatie'])
    .rename(columns={'Region': 'judet'})
)

# Normalise three county spellings in df before joining on 'judet'.
# Assigning the replaced Series back avoids chained indexing
# (df[col][mask] = value), which is a silent no-op under pandas Copy-on-Write.
county_spelling = {
    'Dâmbovita': 'Dambovita',
    'Satu-Mare': 'Satu Mare',
    'Vâlcea': 'Valcea',
}
df['judet'] = df['judet'].replace(county_spelling)

# Inner join: rows whose 'judet' has no match in df_counties are dropped.
df = df.merge(df_counties[['judet', 'id']], on='judet', how='inner')
# Per-row count of (non-null 'judet') rows that share the same county id.
df['pupils_per_county'] = df.groupby('id')['judet'].transform('count')

Visualisation - Pupils distribution per counties

In [None]:
# Pupils distribution per counties
# 'pupils_per_county' is identical on every row of a county, so one row per
# county id carries all the information the map needs. Passing the full
# per-pupil frame would only inflate the figure with duplicate locations.
county_totals = df[['id', 'judet', 'pupils_per_county']].drop_duplicates(subset='id')

fig = px.choropleth_mapbox(county_totals,
                           geojson=counties,
                           featureidkey="properties.cartodb_id",
                           locations='id',
                           color='pupils_per_county',
                           hover_name='pupils_per_county',
                           hover_data={"pupils_per_county": False, 'id': False, "judet": True},
                           color_continuous_scale="greens",
                           color_continuous_midpoint=0,
                           # The lower bound sits below zero; presumably to keep
                           # low-count counties off the palest end of the scale
                           # — confirm intent.
                           range_color=(-5000, county_totals['pupils_per_county'].max()),
                           mapbox_style="carto-darkmatter",
                           zoom=5.5, center={"lat": 46, "lon": 25.5},
                           opacity=0.8,
                           labels={'pupils_per_county': 'Pupils'},
                           )
# NOTE(review): update_geos targets geo subplots; it is likely a no-op on a
# mapbox figure — confirm before relying on fitbounds here.
fig.update_geos(fitbounds="locations", visible=False)
# Remove the outer margins and overlay the title as an annotation box.
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0},
    annotations=[
        dict(
            text='Pupils distribution per counties',
            font=dict(
                family='sans-serif',
                size=25,
                color='#222'
            ),
            bgcolor='#E2E2E2',
            bordercolor='#FFFFFF',
            borderwidth=2,
            x=0.02,
            y=0.9,
            xanchor="left",
            yanchor="bottom"
        )])
fig.show()

Create pivot table with ethnicity per counties and visualize the data

In [None]:
# Pupils of other ethnicities per county: count rows per (judet, ethnic),
# discard the 'Român?' column, total what is left and map that total.
pt = (
    df.pivot_table(index='judet', columns='ethnic', aggfunc='size', fill_value=0)
    .drop(columns=['Român?'])
)
pt['Total'] = pt.sum(axis=1)
# Bring in the county ids needed to match rows to GeoJSON features.
pt = pt.merge(df_counties, on='judet')

map_title = dict(
    text='Pupils other ethnicity distribution per counties',
    font=dict(family='sans-serif', size=25, color='#222'),
    bgcolor='#E2E2E2',
    bordercolor='#FFFFFF',
    borderwidth=2,
    x=0.02,
    y=0.9,
    xanchor="left",
    yanchor="bottom",
)

fig = px.choropleth_mapbox(
    pt,
    geojson=counties,
    featureidkey="properties.cartodb_id",
    locations='id',
    color='Total',
    hover_name='Total',
    hover_data={"Total": False, 'id': False, 'judet': True},
    color_continuous_scale="greens",
    color_continuous_midpoint=0,
    range_color=(0, max(pt['Total'])),
    mapbox_style="carto-darkmatter",
    zoom=5.5,
    center={"lat": 46, "lon": 25.5},
    opacity=0.8,
    labels={'Total': 'Pupils'},
)
fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    annotations=[map_title],
)
fig.show()