<a href="https://colab.research.google.com/github/stanislawWojtas/Data-Engineering/blob/main/project01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [135]:
import pandas as pd
import numpy as np

### Exercise 1: Column information
Loading proj1_ex01.csv into a dataframe:

In [136]:
url = "proj1_ex01.csv"
df = pd.read_csv(url)
df

Unnamed: 0,First column,two,three,SOME;NAME,five,What is this even?,seven,eight
0,0.348554,-0.14509562920877161,-0.012336991474672475,9,red,good,quarrelsome,2016-05-26 09:33:42
1,-1.493853,0.12436946488785079,1.4611100361038865,4,red,bad,doctor,2016-12-03 18:55:52
2,-0.325891,,-0.42191202598625566,2,red,average,large,2016-05-15 11:49:26
3,-0.506596,0.3991147675939107,-0.26502607502330217,5,green,average,muddled,2015-01-30 22:33:29
4,,-0.6913144223047157,-0.26502607502330217,2,blue,good,coordinated,2015-11-20 00:15:35
5,0.527112,2.584347847701393,-0.26502607502330217XYZ,2,blue,good,separate,2017-11-17 09:58:54
6,-1.55529,unknown,-0.7732649697439955,5,green,bad,bright,2017-05-01 10:32:41


In [137]:
col_names = df.columns
col_names

Index(['First column', 'two', 'three', 'SOME;NAME', 'five',
       'What is this even?', 'seven', 'eight'],
      dtype='object')

In [138]:
missing_val = df.isnull().sum()/ df.shape[0]
missing_val = np.round(missing_val, 2)
missing_val

Unnamed: 0,0
First column,0.14
two,0.14
three,0.0
SOME;NAME,0.0
five,0.0
What is this even?,0.0
seven,0.0
eight,0.0


In [139]:
types = df.dtypes
types = types.astype(str)
types = types.replace("int64", "int")
types = types.replace("float64", "float")
types = types.replace("object", "other")
types

Unnamed: 0,0
First column,float
two,other
three,other
SOME;NAME,int
five,other
What is this even?,other
seven,other
eight,other


In [140]:
df_json = pd.DataFrame({"name": col_names, "missing": missing_val, "type": types})
df_json

Unnamed: 0,name,missing,type
First column,First column,0.14,float
two,two,0.14,other
three,three,0.0,other
SOME;NAME,SOME;NAME,0.0,int
five,five,0.0,other
What is this even?,What is this even?,0.0,other
seven,seven,0.0,other
eight,eight,0.0,other


In [141]:
df_json.to_json("proj1_ex01_fields.json", orient="records", indent=4)

### Exercise 2: Value statistics

In [142]:
# Computing statictics for all columns in dataframe
# dictionary for all stats
stats = {}
for col in df.columns:
  if(df[col].dtype == 'int64' or df[col].dtype == 'float64'):
    new_df = df[df[col] != 'NaN']
    count = int(new_df[col].count())
    mean = new_df[col].mean()
    std = new_df[col].std()
    min = new_df[col].min()
    max = new_df[col].max()
    at_75 = new_df[col].quantile(0.75)
    at_25 = new_df[col].quantile(0.25)
    col_stats = {"count": count, "mean": mean, "std": std, "min": min, "25%": at_25, "75%": at_75, "max": max}
    stats[col] = col_stats
  else:
    df_without_nan = df[df[col] != 'NaN']
    count = int(df_without_nan[col].count())
    unique_values = len(df[col].unique())
    # method mode() returns Pandas Series with most frequent values from most to leas frequent
    top = df[col].mode()[0]
    freq = len(df[df[col] == top])
    col_stats = {"count": count, "unique": unique_values, "top": top, "freq": freq}
    stats[col] = col_stats

stats

{'First column': {'count': 6,
  'mean': -0.5009940002009552,
  'std': 0.8839385203395562,
  'min': -1.55529041326908,
  '25%': -1.247038692513933,
  '75%': 0.1799426841401469,
  'max': 0.5271122588523375},
 'two': {'count': 6, 'unique': 7, 'top': '-0.14509562920877161', 'freq': 1},
 'three': {'count': 7, 'unique': 7, 'top': '-0.012336991474672475', 'freq': 1},
 'SOME;NAME': {'count': 7,
  'mean': 4.142857142857143,
  'std': 2.544836041121407,
  'min': 2,
  '25%': 2.0,
  '75%': 5.0,
  'max': 9},
 'five': {'count': 7, 'unique': 3, 'top': 'red', 'freq': 3},
 'What is this even?': {'count': 7, 'unique': 3, 'top': 'good', 'freq': 3},
 'seven': {'count': 7, 'unique': 7, 'top': 'bright', 'freq': 1},
 'eight': {'count': 7, 'unique': 7, 'top': '2015-01-30 22:33:29', 'freq': 1}}

In [143]:
# Saving to json file
import json
with open("proj1_ex02_stats.json", "w") as json_file:
  json.dump(stats, json_file, indent=4)

### Exercise 3: Column names

In [144]:
# Renaming column names
import re # for regular expressions
df.columns = [re.sub(r'[^A-Za-z0-9_ ]', '', col) for col in df.columns]
df = df.rename(columns=str.lower)
df = df.rename(columns= lambda x: x.replace(" ", "_"))
df

Unnamed: 0,first_column,two,three,somename,five,what_is_this_even,seven,eight
0,0.348554,-0.14509562920877161,-0.012336991474672475,9,red,good,quarrelsome,2016-05-26 09:33:42
1,-1.493853,0.12436946488785079,1.4611100361038865,4,red,bad,doctor,2016-12-03 18:55:52
2,-0.325891,,-0.42191202598625566,2,red,average,large,2016-05-15 11:49:26
3,-0.506596,0.3991147675939107,-0.26502607502330217,5,green,average,muddled,2015-01-30 22:33:29
4,,-0.6913144223047157,-0.26502607502330217,2,blue,good,coordinated,2015-11-20 00:15:35
5,0.527112,2.584347847701393,-0.26502607502330217XYZ,2,blue,good,separate,2017-11-17 09:58:54
6,-1.55529,unknown,-0.7732649697439955,5,green,bad,bright,2017-05-01 10:32:41


In [145]:
# Saving dataframe to json filed (without index)
df.to_csv("proj1_ex03_columns.csv", index=False)

### Exercise 4: Output formats

In [146]:
# Creating Excel file with headers but not the indexes
df.to_excel("proj1_ex04_excel.xlsx", header=True, index=False)

In [147]:
# Creating Json file which contains an array of rows stored as dictionaries, each with the DF column as key
df.to_json("proj1_ex04_json.json", orient="records", indent=4)

In [148]:
# Creating a pickle file
df.to_pickle("proj1_ex04_pickle.pkl")

### Exercise 5: Selecting rows and columns

In [149]:
# Reading data from pickle file
df = pd.read_pickle("proj1_ex05.pkl")
df

Unnamed: 0,name,description,age
v,V,Freedom fighter,
evey,Evey Hammond,Revolutionary,16.0
finch,Eric Finch,Police detective,40.0
creedy,Peter Creedy,Government official,49.0
gordon,Gordon Deitrich,Talk show host,38.0
valerie,Valerie Page,Actress,
delia,Delia Surridge,Medical researcher,50.0


In [150]:
#Selecting 2nd and 3rd column
df_selected_1 = df.iloc[:, 1:3]
df_selected_1

Unnamed: 0,description,age
v,Freedom fighter,
evey,Revolutionary,16.0
finch,Police detective,40.0
creedy,Government official,49.0
gordon,Talk show host,38.0
valerie,Actress,
delia,Medical researcher,50.0


In [151]:
#Selecting rows whose index begin with the letter v
df_selected_2 = df_selected_1[df_selected_1.index.str.startswith("v")]
df_selected_2

Unnamed: 0,description,age
v,Freedom fighter,
valerie,Actress,


In [152]:
#prevent nan from being printed
df_selected_2 = df_selected_2.replace(np.nan, "")
df_selected_2

Unnamed: 0,description,age
v,Freedom fighter,
valerie,Actress,


In [153]:
# save to markdown table
df_selected_2.to_markdown("proj_ex05_table.md")

### Exercise 6: Flattening data

In [154]:
# Loading json file to dataframe

with open("proj1_ex06.json", 'r') as file:
  data = json.load(file) #list of dictionaries
df = pd.json_normalize(data)
df

Unnamed: 0,brand,model,year,engine.type,engine.displacement,engine.power,engine.environmental.euro,engine.environmental.filter
0,Audi,A3,2022.0,Gasoline,1.4L,150 hp,,
1,Audi,2023,,Diesel,2.0L,190 hp,6.0,DPF
2,BMW,3 Series,2022.0,Hybrid,2.0L,288 hp,,
3,BMW,X7,2023.0,Gasoline,4.4L,456 hp,,
4,Mercedes-Benz,C-Class,2022.0,Diesel,2.0L,194 hp,,
5,Mercedes-Benz,GLE,2023.0,Hybrid,3.0L,362 hp,,


In [155]:
for col in df.columns:
  print(col)

brand
model
year
engine.type
engine.displacement
engine.power
engine.environmental.euro
engine.environmental.filter


In [156]:
# load into pickle file
df.to_pickle("proj01_ex06_pickle.pkl")