## Датафреймы Pandas

### Импортируем библиотеки

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model as skl

### Создание датафрейма при помощи массивов

In [30]:
col = ['Name', 'Drink', 'Price']

d = [
    ['Helena', 'Cocktail', 150],
    ['Peter', 'Beer', 70],
    ['Nick', 'Beer', 210],
    ['Kate', 'Mojito', 100]
    ]

#ind = np.arange(1, len(d)+1)
df = pd.DataFrame(columns = col,
                  data=d,
                 # index = ind
                 )
df

Unnamed: 0,Name,Drink,Price
0,Helena,Cocktail,150
1,Peter,Beer,70
2,Nick,Beer,210
3,Kate,Mojito,100


### Действия с датафреймами

In [31]:
# Локализация столбцов
df0 = df[['Name', 'Price']]
df0

Unnamed: 0,Name,Price
0,Helena,150
1,Peter,70
2,Nick,210
3,Kate,100


In [32]:
# Локализация строк (по индексу)
df.loc[[2,3]]

# Локализация строк (по условию)
df.loc[(df['Price'] >= 100) & (df['Name'] != 'Nick')]

Unnamed: 0,Name,Drink,Price
0,Helena,Cocktail,150
3,Kate,Mojito,100


In [33]:
# Добавление столбцов
df['Taxi'] = ['yes', 'yes', 'no', 'yes']
df

Unnamed: 0,Name,Drink,Price,Taxi
0,Helena,Cocktail,150,yes
1,Peter,Beer,70,yes
2,Nick,Beer,210,no
3,Kate,Mojito,100,yes


In [34]:
# Добавление строк
df.loc[len(df)] = ['Maria', 'Juice', 50, 'no']
df

Unnamed: 0,Name,Drink,Price,Taxi
0,Helena,Cocktail,150,yes
1,Peter,Beer,70,yes
2,Nick,Beer,210,no
3,Kate,Mojito,100,yes
4,Maria,Juice,50,no


In [39]:
# Удаление столбцов
df.drop(columns = ['Taxi', 'Name'])

Unnamed: 0,Drink,Price
0,Cocktail,150
1,Beer,70
2,Beer,210
3,Mojito,100
4,Juice,50


In [42]:
# Удаление строк (по индексу)
df.drop([1,3])

Unnamed: 0,Name,Drink,Price,Taxi
0,Helena,Cocktail,150,yes
2,Nick,Beer,210,no
4,Maria,Juice,50,no


In [43]:
# Удаление строк (по условию)
df.loc[df['Drink'] != 'Beer']

Unnamed: 0,Name,Drink,Price,Taxi
0,Helena,Cocktail,150,yes
3,Kate,Mojito,100,yes
4,Maria,Juice,50,no


### Чтение датафреймов из файлов CSV

In [71]:
ds_titanic = pd.read_csv('titanic_train.csv', sep = ',')

In [72]:
# Пропуски данных меняем на 0
ds_titanic.fillna(0)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
647,0,3,Mr. Mito Mitkoff,male,23.0,0,0,7.8958
648,1,2,Miss. Elsie Doling,female,18.0,0,1,23.0000
649,0,3,Mr. Johannes Halvorsen Kalvik,male,21.0,0,0,8.4333
650,1,3,Miss. Hanora O'Leary,female,16.0,0,0,7.8292


In [73]:
#Замена строковых значений на числовые (метод replace)
ds_titanic = ds_titanic.replace({'Sex': {"male": 1, "female":0}})


### Создание массивов numpy из dataframe

In [88]:
X = ds_titanic.drop(columns = ['Name', 'Age']).to_numpy()
y = ds_titanic[["Age"]].to_numpy()

### Построение и обучение регрессионной модели

In [89]:
model=skl.LinearRegression()
model.fit(X,y)
model.score(X,y)

0.29038418359602425

### Прогноз целевой функции на новых данных

In [90]:
ds_test = pd.read_csv('titanic_test.csv', sep = ',')
ds_test.fillna(0)
ds_test = ds_test.replace({'Sex': {"male": 1, "female":0}})

In [91]:
X_test = ds_test.drop(columns = ['Name', 'Age']).to_numpy()
y_test = ds_test[["Age"]].to_numpy()
model.score(X_test,y_test)

0.19613266325507162

In [92]:
model.predict(X_test)

array([[29.45468477],
       [29.25758243],
       [23.59691677],
       [36.90688455],
       [39.91702785],
       [31.06457675],
       [29.26735722],
       [44.44714129],
       [29.26341117],
       [20.73456277],
       [29.45468477],
       [36.90688455],
       [29.25934271],
       [29.25533546],
       [34.90128523],
       [25.7408663 ],
       [40.7769608 ],
       [36.94331412],
       [31.66959243],
       [37.09631833],
       [29.25934271],
       [29.25533546],
       [23.35355154],
       [15.00849598],
       [30.49095898],
       [28.61567697],
       [38.46462103],
       [29.23821356],
       [ 9.01036353],
       [31.61654187],
       [29.95278514],
       [14.02693796],
       [29.22443152],
       [29.25903962],
       [34.2385753 ],
       [35.46680953],
       [21.67528418],
       [23.31209939],
       [29.26735722],
       [44.43311591],
       [36.89959863],
       [29.25533546],
       [23.38427479],
       [38.29275032],
       [29.26116419],
       [32

In [93]:
ds_test['Прогноз возраста'] = model.predict(X_test)

In [94]:
ds_test

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,Прогноз возраста
0,0,2,Mr. Leonard Mark Hickman,1,24.0,2,0,73.5000,29.454685
1,0,3,Mr. Alexander Radeff,1,27.0,0,0,7.8958,29.257582
2,0,3,Mrs. John (Catherine) Bourke,0,32.0,1,1,15.5000,23.596917
3,0,2,Mr. George Floyd Eitemiller,1,23.0,0,0,13.0000,36.906885
4,0,1,Mr. Arthur Webster Newell,1,58.0,0,2,113.2750,39.917028
...,...,...,...,...,...,...,...,...,...
230,0,2,Rev. Juozas Montvila,1,27.0,0,0,13.0000,36.906885
231,1,1,Miss. Margaret Edith Graham,0,19.0,0,0,30.0000,38.507168
232,0,3,Miss. Catherine Helen Johnston,0,7.0,1,2,23.4500,21.854898
233,1,1,Mr. Karl Howell Behr,1,26.0,0,0,30.0000,39.145551
