# 1. Reshape

In [1]:
import numpy as np
# 1-D array holding the integers 1 through 10 (the stop value 11 is excluded).
x = np.arange(1, 11)
x

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [2]:
# Shape of x as a tuple; a 1-D array of 10 elements reports (10,).
x.shape

(10,)

In [3]:
# Arrange the 10 elements as 2 rows x 5 columns, filled row by row.
# The result is only displayed; x is not reassigned.
np.reshape(x,(2,5))

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [4]:
# Column vector: 10 rows, 1 column.
np.reshape(x,(10,1))

array([[ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10]])

In [5]:
# -1 asks NumPy to infer that dimension from the array size,
# so this gives the same (10, 1) column vector as the explicit shape.
np.reshape(x,(-1,1))

array([[ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10]])

In [6]:
# Method form of reshape: 5 rows x 2 columns, filled row by row.
x.reshape((5,2))

array([[ 1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8],
       [ 9, 10]])

# 2. Performance

In [7]:
# Benchmark input: 20 million evenly spaced points from 1 to 10,000,000,
# plus the two coefficients (b0, b1) used by the timing cells.
x = np.linspace(start=1, stop=10_000_000, num=20_000_000)
b0, b1 = 5, 2

In [8]:
# Baseline: evaluate y = b0 + b1*x one element at a time in a Python loop
# and report how long the loop takes.
#
# perf_counter is the monotonic, high-resolution clock intended for measuring
# elapsed intervals; time() follows the wall clock and can jump if the system
# clock is adjusted. `time` is still imported so that any other cell relying
# on it keeps working.
from time import perf_counter, time

y = np.empty_like(x)  # pre-allocated output with the same shape and dtype as x

inicio = perf_counter()

for index, value in enumerate(x):
    y[index] = b0 + b1 * value

fin = perf_counter()

print(f"El tiempo total es: {fin - inicio}")

El tiempo total es: 6.312220096588135


In [9]:
# Vectorised version of y = b0 + b1*x written as a matrix product:
# A = [1 | x] (n x 2) times betas = [b0, b1]^T (2 x 1) gives y1 (n x 1).
# The timed span includes building A, not only the matmul itself.
#
# perf_counter is imported here so the cell does not depend on an import made
# in a different cell, and because it is the clock intended for measuring
# elapsed intervals.
from time import perf_counter

inicio = perf_counter()
vect1 = np.ones_like(x).reshape((-1,1))  # column of ones for the intercept term
x = x.reshape((-1,1))  # NOTE: rebinds x as an (n, 1) column for the cells that follow
A = np.hstack([vect1,x])
betas = np.array([[b0],[b1]])

y1=np.matmul(A,betas)
fin = perf_counter()

print(f"El tiempo total es: {fin - inicio}")

El tiempo total es: 0.21624541282653809


# 3. Pandas (Panel Data)

In [10]:
import pandas as pd
# Ten uniform samples in [0, 1).
# A seeded Generator makes the values identical on every "Restart & Run All",
# so the displayed outputs stay reproducible.
rng = np.random.default_rng(42)
data = rng.random(10)
data

array([0.99284089, 0.46172977, 0.01510003, 0.29186295, 0.39702552,
       0.13483341, 0.70178824, 0.31001332, 0.4374365 , 0.3910326 ])

In [11]:
# Wrap the array in a Series with a custom index 10, 20, ..., 100
# (the stop value 110 is excluded), one label per element of data.
pd.Series(data, index=np.arange(10,110,10))

10     0.992841
20     0.461730
30     0.015100
40     0.291863
50     0.397026
60     0.134833
70     0.701788
80     0.310013
90     0.437436
100    0.391033
dtype: float64

In [12]:
# A Series can be built directly from a dict: the keys become the index.
# The values mix int, str and float, so the resulting dtype is object.
diccionario = dict(A=30, B=60, C='Hola a todos', D=156.5645)
pd.Series(diccionario)

A              30
B              60
C    Hola a todos
D        156.5645
dtype: object

In [13]:
# Build a DataFrame from a dict of columns. A column may be given as a plain
# list or as a Series; columnaB deliberately mixes strings and numbers.
d = dict(
    columnaA=[10, 20, 30, 40],
    columnaB=pd.Series(['abc', 10.5, 34, 'xy']),
    columnaC=pd.Series([10.45, 89.34, 90.76, 24.89]),
)

df1 = pd.DataFrame(data=d)
df1

Unnamed: 0,columnaA,columnaB,columnaC
0,10,abc,10.45
1,20,10.5,89.34
2,30,34,90.76
3,40,xy,24.89


In [14]:
# Selecting a single column with [] returns a pandas Series.
type(df1['columnaB'])

pandas.core.series.Series

In [15]:
# Attribute-style access to a column. It only works when the column name is a
# valid Python identifier that does not clash with a DataFrame attribute.
df1.columnaA

0    10
1    20
2    30
3    40
Name: columnaA, dtype: int64

In [16]:
# Bracket access to the same column; this form works for any column name.
df1['columnaA']

0    10
1    20
2    30
3    40
Name: columnaA, dtype: int64

In [17]:
# A list of names selects several columns, in the order given,
# and returns a DataFrame rather than a Series.
df1[['columnaC','columnaA']]

Unnamed: 0,columnaC,columnaA
0,10.45,10
1,89.34,20
2,90.76,30
3,24.89,40


In [18]:
# .loc selects by label: rows labelled 0, 1 and 3, and the two named columns.
df1.loc[[0,1,3],['columnaC','columnaA']]

Unnamed: 0,columnaC,columnaA
0,10.45,10
1,89.34,20
3,24.89,40


In [19]:
# .iloc selects by integer position: rows at positions 0, 1 and 3, and the
# columns at positions 2 and 0 (columnaC, columnaA). Same result as the .loc call.
df1.iloc[[0,1,3],[2,0]]

Unnamed: 0,columnaC,columnaA
0,10.45,10
1,89.34,20
3,24.89,40


In [20]:
# Create a new column as the element-wise sum of two existing columns.
# The assignment modifies df1 itself, so every later cell sees 'nueva'.
df1['nueva'] = df1['columnaA'] + df1['columnaC']
df1

Unnamed: 0,columnaA,columnaB,columnaC,nueva
0,10,abc,10.45,20.45
1,20,10.5,89.34,109.34
2,30,34,90.76,120.76
3,40,xy,24.89,64.89


In [21]:
# Drop the row labelled 0 (axis=0 means rows).
# This returns a new DataFrame; df1 itself is left unchanged.
df1.drop(0,axis=0)

Unnamed: 0,columnaA,columnaB,columnaC,nueva
1,20,10.5,89.34,109.34
2,30,34,90.76,120.76
3,40,xy,24.89,64.89


In [22]:
# Drop the column 'columnaB' (axis=1 means columns).
# This returns a new DataFrame; df1 itself is left unchanged.
df1.drop('columnaB', axis=1)

Unnamed: 0,columnaA,columnaC,nueva
0,10,10.45,20.45
1,20,89.34,109.34
2,30,90.76,120.76
3,40,24.89,64.89


In [23]:
# Remove the row labelled 3 from df1 itself: inplace=True mutates df1 and
# returns None, so there is no result to assign.
# errors="ignore" keeps the cell re-runnable. Once the row is gone, a second
# execution does nothing instead of raising KeyError for the missing label.
df1.drop(3, axis=0, inplace=True, errors="ignore")
df1

Unnamed: 0,columnaA,columnaB,columnaC,nueva
0,10,abc,10.45,20.45
1,20,10.5,89.34,109.34
2,30,34,90.76,120.76


## 3.1 Filtros

In [24]:
# Random integer table used by the filter examples: 10 rows, two columns,
# values from 1 to 19 (the upper bound 20 is excluded).
# A seeded Generator makes the table, and the filter results computed from
# it, identical on every "Restart & Run All".
rng = np.random.default_rng(42)
df2 = pd.DataFrame({
    'A': rng.integers(1, 20, size=10),
    'B': rng.integers(1, 20, size=10)
})
df2

Unnamed: 0,A,B
0,6,6
1,12,11
2,14,7
3,12,3
4,10,9
5,10,1
6,8,5
7,7,6
8,14,19
9,18,17


In [25]:
# Boolean-mask filter: keep only the rows where column A is greater than 7.
df2[df2['A']>7]

Unnamed: 0,A,B
1,12,11
2,14,7
3,12,3
4,10,9
5,10,1
6,8,5
8,14,19
9,18,17


In [26]:
# Combine two conditions with & (element-wise AND): rows where 7 < A < 15.
# Each comparison needs its own parentheses because & binds tighter than > and <.
df2[(df2['A']>7) & (df2['A']<15)]

Unnamed: 0,A,B
1,12,11
2,14,7
3,12,3
4,10,9
5,10,1
6,8,5
8,14,19


## 3.2 Operaciones de Agregación

In [27]:
# Small frame for the aggregation examples: two numeric columns and one
# label column in which 'A' appears twice.
df = pd.DataFrame(
    data={
        'col1': [10, 20, 30, 40, 50, 60],
        'col2': [1, 2, 3, 4, 5, 6],
        'col3': list('ABCDAF'),
    }
)
df


Unnamed: 0,col1,col2,col3
0,10,1,A
1,20,2,B
2,30,3,C
3,40,4,D
4,50,5,A
5,60,6,F


In [28]:
# Sum of all the values in col1.
df['col1'].sum()

210

In [34]:
# Row-wise sum (axis=1): adds col1 and col2 for each row.
df[['col1','col2']].sum(axis=1)

0    11
1    22
2    33
3    44
4    55
5    66
dtype: int64

In [33]:
# Column-wise sum (axis=0) over every column. For the string column col3 the
# "sum" shown is the concatenation of its values, so the result has dtype object.
df.sum(axis=0)

col1       210
col2        21
col3    ABCDAF
dtype: object

In [35]:
# Number of non-null values in col2.
df['col2'].count()

6

In [36]:
# Frequency of each distinct value in col3, most frequent first.
df['col3'].value_counts()

col3
A    2
B    1
C    1
D    1
F    1
Name: count, dtype: int64

In [37]:
# The column's data as a NumPy array.
# NOTE(review): the pandas docs recommend .to_numpy() over .values in new code.
df['col2'].values

array([1, 2, 3, 4, 5, 6])

In [38]:
# Column labels, returned as a pandas Index.
df.columns

Index(['col1', 'col2', 'col3'], dtype='object')

In [39]:
# The same column labels as a plain NumPy array of dtype object.
df.columns.values

array(['col1', 'col2', 'col3'], dtype=object)

## 3.3 Analytics con Pandas

In [42]:
# Load the hotel bookings CSV into a DataFrame. The path is relative, so the
# file has to be in the notebook's working directory.
dataset = pd.read_csv("hotel_bookings.csv")
# Preview the first 10 rows.
dataset.head(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03
5,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03
6,Resort Hotel,0,0,2015,July,27,1,0,2,2,...,No Deposit,,,0,Transient,107.0,0,0,Check-Out,2015-07-03
7,Resort Hotel,0,9,2015,July,27,1,0,2,2,...,No Deposit,303.0,,0,Transient,103.0,0,1,Check-Out,2015-07-03
8,Resort Hotel,1,85,2015,July,27,1,0,3,2,...,No Deposit,240.0,,0,Transient,82.0,0,1,Canceled,2015-05-06
9,Resort Hotel,1,75,2015,July,27,1,0,3,2,...,No Deposit,15.0,,0,Transient,105.5,0,0,Canceled,2015-04-22


In [41]:
# Structural summary: index range, column names, non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [43]:
# (number of rows, number of columns).
dataset.shape

(119390, 32)

In [44]:
# dtype of each column; object is what text columns such as 'hotel' report.
dataset.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

In [45]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns; text columns are left out of the result.
dataset.describe()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
count,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119386.0,119390.0,119390.0,119390.0,119390.0,119390.0,103050.0,6797.0,119390.0,119390.0,119390.0,119390.0
mean,0.370416,104.011416,2016.156554,27.165173,15.798241,0.927599,2.500302,1.856403,0.10389,0.007949,0.031912,0.087118,0.137097,0.221124,86.693382,189.266735,2.321149,101.831122,0.062518,0.571363
std,0.482918,106.863097,0.707476,13.605138,8.780829,0.998613,1.908286,0.579261,0.398561,0.097436,0.175767,0.844336,1.497437,0.652306,110.774548,131.655015,17.594721,50.53579,0.245291,0.792798
min,0.0,0.0,2015.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,-6.38,0.0,0.0
25%,0.0,18.0,2016.0,16.0,8.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,62.0,0.0,69.29,0.0,0.0
50%,0.0,69.0,2016.0,28.0,16.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,179.0,0.0,94.575,0.0,0.0
75%,1.0,160.0,2017.0,38.0,23.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,229.0,270.0,0.0,126.0,0.0,1.0
max,1.0,737.0,2017.0,53.0,31.0,19.0,50.0,55.0,10.0,10.0,1.0,26.0,72.0,21.0,535.0,543.0,391.0,5400.0,8.0,5.0


In [46]:
# Number of bookings for each value of is_canceled.
# NOTE(review): presumably 1 = canceled and 0 = not canceled, to be confirmed
# against the dataset documentation.
dataset['is_canceled'].value_counts()

is_canceled
0    75166
1    44224
Name: count, dtype: int64

In [47]:
# Frequency of each distinct lead_time value, most frequent first.
dataset['lead_time'].value_counts()

lead_time
0      6345
1      3460
2      2069
3      1816
4      1715
       ... 
400       1
370       1
532       1
371       1
463       1
Name: count, Length: 479, dtype: int64