<a href="https://colab.research.google.com/github/strzelnat/machine_learning_study/blob/main/supervised/basics/_05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
 !pip install scikit-learn



In [3]:
import pandas as pd
import numpy as np
import sklearn

sklearn.__version__

'1.6.1'

Data loading

In [7]:
def fetch_financial_data(company = 'AMZN'):
  """
    Download stock market quotations for one ticker.

    company: ticker symbol handed to the reader as `name` (default 'AMZN').
    Returns the object produced by pandas_datareader's DataReader for the
    'stooq' data source.
  """
  # Local import: pandas_datareader is only required when this function runs.
  from pandas_datareader import data as quote_reader
  quotes = quote_reader.DataReader(name = company, data_source = 'stooq')
  return quotes

# Fetch quotations for the default ticker and preview the first five rows.
df_raw = fetch_financial_data(company = 'AMZN')
df_raw.head(n = 5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-07-11,223.58,226.6799,222.37,225.02,50518307
2025-07-10,221.55,222.79,219.7,222.26,30370591
2025-07-09,221.07,224.29,220.47,222.54,38155121
2025-07-08,223.915,224.0,218.43,219.36,45691987
2025-07-07,223.0,224.29,222.37,223.47,36604139


Copy of data

In [8]:
# Slice the five leading rows first and copy afterwards: `df` then owns its
# data, so later column assignments never write into a slice of another frame,
# and the full history is not duplicated just to be thrown away.
df = df_raw.iloc[:5].copy()
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-07-11,223.58,226.6799,222.37,225.02,50518307
2025-07-10,221.55,222.79,219.7,222.26,30370591
2025-07-09,221.07,224.29,220.47,222.54,38155121
2025-07-08,223.915,224.0,218.43,219.36,45691987
2025-07-07,223.0,224.29,222.37,223.47,36604139


In [9]:
# Print a summary of the frame: index range, column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5 entries, 2025-07-11 to 2025-07-07
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    5 non-null      float64
 1   High    5 non-null      float64
 2   Low     5 non-null      float64
 3   Close   5 non-null      float64
 4   Volume  5 non-null      int64  
dtypes: float64(4), int64(1)
memory usage: 240.0 bytes


Generate new variables

In [12]:
# Derive one integer column per calendar component of the DatetimeIndex.
for date_part in ('day', 'month', 'year'):
  df[date_part] = getattr(df.index, date_part)

df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,day,month,year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-07-11,223.58,226.6799,222.37,225.02,50518307,11,7,2025
2025-07-10,221.55,222.79,219.7,222.26,30370591,10,7,2025
2025-07-09,221.07,224.29,220.47,222.54,38155121,9,7,2025
2025-07-08,223.915,224.0,218.43,219.36,45691987,8,7,2025
2025-07-07,223.0,224.29,222.37,223.47,36604139,7,7,2025


Binning a numeric variable into intervals

In [13]:
# Toy sample of weights used to demonstrate binning below.
test = pd.Series(
  [65, 55, 40, 72, 80, 66, 68, 82, 38],
  name = 'weight',
).to_frame()
test

Unnamed: 0,weight
0,65
1,55
2,40
3,72
4,80
5,66
6,68
7,82
8,38


In [23]:
# Split the observed weight range into six equal-width intervals.
test['w1'] = pd.cut(test['weight'], 6)
test

Unnamed: 0,weight,w1
0,65,"(60.0, 67.333]"
1,55,"(52.667, 60.0]"
2,40,"(37.956, 45.333]"
3,72,"(67.333, 74.667]"
4,80,"(74.667, 82.0]"
5,66,"(60.0, 67.333]"
6,68,"(67.333, 74.667]"
7,82,"(74.667, 82.0]"
8,38,"(37.956, 45.333]"


In [24]:
# Bin with hand-picked edges instead: (30, 60], (60, 80], (80, 90].
test['w1'] = pd.cut(test['weight'], bins = (30, 60, 80, 90))
test

Unnamed: 0,weight,w1
0,65,"(60, 80]"
1,55,"(30, 60]"
2,40,"(30, 60]"
3,72,"(60, 80]"
4,80,"(60, 80]"
5,66,"(60, 80]"
6,68,"(60, 80]"
7,82,"(80, 90]"
8,38,"(30, 60]"


In [25]:
# Same edges as before, but each interval gets a readable category label.
# NOTE(review): 'owerweight' looks like a typo for 'overweight' - confirm before renaming,
# since the label is part of the data shown in the output.
test['w1'] = pd.cut(
  test['weight'],
  bins = (30, 60, 80, 90),
  labels = ['thin', 'normal', 'owerweight'],
)
test

Unnamed: 0,weight,w1
0,65,normal
1,55,thin
2,40,thin
3,72,normal
4,80,normal
5,66,normal
6,68,normal
7,82,owerweight
8,38,thin


In [26]:
# Display the quotations frame again, including the derived day/month/year columns.
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,day,month,year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-07-11,223.58,226.6799,222.37,225.02,50518307,11,7,2025
2025-07-10,221.55,222.79,219.7,222.26,30370591,10,7,2025
2025-07-09,221.07,224.29,220.47,222.54,38155121,9,7,2025
2025-07-08,223.915,224.0,218.43,219.36,45691987,8,7,2025
2025-07-07,223.0,224.29,222.37,223.47,36604139,7,7,2025


In [27]:
# New toy frame: every row holds a list of language codes.
# NOTE: this rebinds `df`, which previously held the stock quotations.
df = pd.DataFrame({
  'lang': [
    ['PL', 'ENG'],
    ['GER', 'ENG', 'PL', 'FRA'],
    ['RUS', 'ANG'],
  ],
})
df

Unnamed: 0,lang
0,"[PL, ENG]"
1,"[GER, ENG, PL, FRA]"
2,"[RUS, ANG]"


In [28]:
# Number of languages listed in each row.
df['lenght'] = df['lang'].map(len)

In [29]:
# Display the language frame together with the new `lenght` column.
df

Unnamed: 0,lang,lenght
0,"[PL, ENG]",2
1,"[GER, ENG, PL, FRA]",4
2,"[RUS, ANG]",2


In [31]:
# 1 when 'PL' appears in the row's language list, otherwise 0.
df['PL_flag'] = df['lang'].map(lambda langs: int('PL' in langs))
df

Unnamed: 0,lang,lenght,PL_flag
0,"[PL, ENG]",2,1
1,"[GER, ENG, PL, FRA]",4,1
2,"[RUS, ANG]",2,0


In [32]:
# 1 when 'ANG' appears in the row's language list, otherwise 0.
df['ANG_flag'] = df['lang'].map(lambda langs: int('ANG' in langs))
df

Unnamed: 0,lang,lenght,PL_flag,ANG_flag
0,"[PL, ENG]",2,1,0
1,"[GER, ENG, PL, FRA]",4,1,0
2,"[RUS, ANG]",2,0,1


In [33]:
# 1 when 'GER' appears in the row's language list, otherwise 0.
df['GER_flag'] = df['lang'].map(lambda langs: int('GER' in langs))
df

Unnamed: 0,lang,lenght,PL_flag,ANG_flag,GER_flag
0,"[PL, ENG]",2,1,0,0
1,"[GER, ENG, PL, FRA]",4,1,0,1
2,"[RUS, ANG]",2,0,1,0


In [35]:
# Toy frame of domain names used to demonstrate string splitting.
df = pd.Series(['wp.pl', 'onet.pl', 'google.com'], name = 'websites').to_frame()
df

Unnamed: 0,websites
0,wp.pl
1,onet.pl
2,google.com


In [39]:
# Split each domain on '.', spreading the pieces over integer-named columns (0, 1, ...).
# NOTE: this overwrites `df`, so the cell cannot be re-run without rebuilding the frame first.
df = df['websites'].str.split('.', expand = True)

In [44]:
# Give the two pieces of the split meaningful names: column 0 is the portal
# name and column 1 the domain extension. A single rename keeps both columns
# (copying only `extension` and dropping 0 and 1 would lose the portal) and can
# be re-run safely, because rename ignores keys that are no longer present.
df = df.rename(columns = {0: 'portal', 1: 'extension'})
df

Unnamed: 0,portal,extension
0,wp,pl
1,onet,pl
2,google,com
