In [1]:
import numpy as np
import pandas as pd
import json
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [2]:
with open('data-text.json', 'r') as file:
  data = json.load(file)

df = pd.DataFrame(data)
df.to_csv('JsonToDataFrame.csv')
df

Unnamed: 0,Indicator,PUBLISH STATES,Year,WHO region,World Bank income group,Country,Sex,Display Value,Numeric,Low,High,Comments
0,Life expectancy at birth (years),Published,1990,Europe,High-income,Andorra,Both sexes,77,77.0,,,
1,Life expectancy at birth (years),Published,2000,Europe,High-income,Andorra,Both sexes,80,80.0,,,
2,Life expectancy at age 60 (years),Published,2012,Europe,High-income,Andorra,Female,28,28.0,,,
3,Life expectancy at age 60 (years),Published,2000,Europe,High-income,Andorra,Both sexes,23,23.0,,,
4,Life expectancy at birth (years),Published,2012,Eastern Mediterranean,High-income,United Arab Emirates,Female,78,78.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
4651,Healthy life expectancy (HALE) at birth (years),Published,2012,Western Pacific,Lower-middle-income,Samoa,Female,66,66.0,,,
4652,Healthy life expectancy (HALE) at birth (years),Published,2012,Eastern Mediterranean,Low-income,Yemen,Both sexes,54,54.0,,,
4653,Healthy life expectancy (HALE) at birth (years),Published,2000,Africa,Upper-middle-income,South Africa,Male,49,49.0,,,
4654,Healthy life expectancy (HALE) at birth (years),Published,2000,Africa,Low-income,Zambia,Both sexes,36,36.0,,,


In [3]:
with open('data-text.xml', 'r') as file:
    contents = file.read()

soup = BeautifulSoup(contents, 'xml')

In [4]:
att_label = []
att_EntityType = []
att_Entity = []
Display = []

for attr in soup.find_all('Attribute'):
  att_label.append(attr["Label"])
  att_EntityType.append(attr["EntityType"])
  att_Entity.append(attr["Entity"])
  Display.append(attr.find("Display").text)

dataframe = {
    "Label": att_label,
    "EntityType":att_EntityType,
    "Entity": att_Entity,
}

df = pd.DataFrame(dataframe)
df.head(5)

Unnamed: 0,Label,EntityType,Entity
0,DS,CORE_DIMENSION,COUNTRY
1,FIPS,CORE_DIMENSION,COUNTRY
2,IOC,CORE_DIMENSION,COUNTRY
3,ISO2,CORE_DIMENSION,COUNTRY
4,ISO,CORE_DIMENSION,COUNTRY


Regression PART

In [5]:
df = pd.read_csv("iris.csv")
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [6]:
df.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [8]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [9]:
df.dropna()
df.drop_duplicates()
df.fillna(0)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [11]:
df.drop(columns=['species'], inplace=True)

In [13]:
X = df.drop(columns=["petal_width"])
y = df["petal_width"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = LinearRegression()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

print()
print(X_train.corr())

Mean Absolute Error: 0.1693268164410859
Mean Squared Error: 0.051607270608851764
Root Mean Squared Error: 0.22717233680369572

              sepal_length  sepal_width  petal_length
sepal_length      1.000000    -0.060566      0.875775
sepal_width      -0.060566     1.000000     -0.377609
petal_length      0.875775    -0.377609      1.000000
