In [3]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab session.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# List the mount point to confirm the drive contents are visible.
!ls 'drive'

MyDrive


In [5]:
# Switch the working directory to the project folder so the relative
# 'data/...' paths used when loading the CSV files resolve.
%cd 'drive/MyDrive/DataAnalysis'

/content/drive/MyDrive/DataAnalysis


In [43]:
import pandas as pd

# Load every dataset used in the notebook from the local data/ folder.
houses = pd.read_csv('data/kc_house_data.csv')
titanic = pd.read_csv('data/titanic.csv')
# The Netflix file is pipe-delimited and its first column is used as the index.
netflix = pd.read_csv('data/netflix_titles.csv', sep='|', index_col=0)
btc = pd.read_csv('data/coin_Bitcoin.csv')
countries = pd.read_csv('data/world-happiness-report-2021.csv')

## Casting With astype()

In [44]:
# Inspect column dtypes and non-null counts before any casting.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1309 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1309 non-null   object
 9   cabin      1309 non-null   object
 10  embarked   1309 non-null   object
 11  boat       1309 non-null   object
 12  body       1309 non-null   object
 13  home.dest  1309 non-null   object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB


In [45]:
# Count each distinct value in 'age' to see what keeps the column from being numeric.
titanic['age'].value_counts()

Unnamed: 0_level_0,count
age,Unnamed: 1_level_1
?,263
24,47
22,43
21,41
30,40
...,...
60.5,1
74,1
0.4167,1
11.5,1


**The next line raises an error: `age` is an `object` column that contains the placeholder `'?'`, which cannot be converted to a number.**

In [46]:
# Demonstrate that a direct cast fails while 'age' still holds non-numeric
# strings. The exception is caught and printed so the notebook still runs
# top to bottom with "Run All".
try:
    titanic['age'].astype('float')
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: could not convert string to float: '?'

In [47]:
# Replace the '?' placeholder with a missing value and assign the result back.
# Calling replace(..., inplace=True) on titanic['age'] works on an intermediate
# Series: pandas 2.2 warns about it and, with Copy-on-Write enabled, the frame
# is left unchanged. The list form keeps None treated as the replacement value.
titanic['age'] = titanic['age'].replace(['?'], [None])

In [49]:
# dropna=False keeps missing values in the tally, so the replaced entries are counted too.
titanic['age'].value_counts(dropna=False)

Unnamed: 0_level_0,count
age,Unnamed: 1_level_1
,263
24,47
22,43
21,41
30,40
...,...
60.5,1
74,1
0.4167,1
11.5,1


In [50]:
# The cast now succeeds, but the result is only displayed here, not stored.
titanic['age'].astype('float')

Unnamed: 0,age
0,29.0000
1,0.9167
2,2.0000
3,30.0000
4,25.0000
...,...
1304,14.5000
1305,
1306,26.5000
1307,27.0000


In [52]:
# Check the stored column's dtype after the previous (unassigned) cast.
titanic['age'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1309 entries, 0 to 1308
Series name: age
Non-Null Count  Dtype 
--------------  ----- 
1046 non-null   object
dtypes: object(1)
memory usage: 10.4+ KB


**`titanic['age']` is still `object`! `astype()` returns a new Series, so the result has to be assigned back to the column.**

In [53]:
# Assign the cast result back so the column itself becomes float.
titanic['age'] = titanic['age'].astype(float)

In [54]:
# Confirm that 'age' is now float64 in the frame.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1309 non-null   object 
 9   cabin      1309 non-null   object 
 10  embarked   1309 non-null   object 
 11  boat       1309 non-null   object 
 12  body       1309 non-null   object 
 13  home.dest  1309 non-null   object 
dtypes: float64(1), int64(4), object(9)
memory usage: 143.3+ KB


In [55]:
# Insert a copy of 'age' as a new column named 'float_age' at position 1.
# insert() modifies the frame in place.
# NOTE(review): re-running this cell while 'float_age' already exists is
# expected to raise, since insert() rejects duplicate column names by default.
titanic.insert(1, 'float_age', titanic['age'])

In [56]:
# Display the frame to verify where the new column was placed.
titanic

Unnamed: 0,pclass,float_age,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,29.0000,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,0.9167,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,2.0000,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,30.0000,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,25.0000,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,14.5000,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,?,C,?,328,?
1305,3,,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,?,C,?,?,?
1306,3,26.5000,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.225,?,C,?,304,?
1307,3,27.0000,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.225,?,C,?,?,?


In [57]:
# Remove the temporary 'float_age' column. Rebinding the result (instead of
# inplace=True) and passing errors='ignore' makes the cell safe to re-run:
# a second execution is a no-op rather than a KeyError.
titanic = titanic.drop(columns='float_age', errors='ignore')

In [58]:
# Display the frame to confirm 'float_age' is gone.
titanic

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,?,C,?,328,?
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,?,C,?,?,?
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.225,?,C,?,304,?
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.225,?,C,?,?,?


In [59]:
# With a float column, describe() reports numeric statistics (mean, std, quartiles).
titanic['age'].describe()

Unnamed: 0,age
count,1046.0
mean,29.881135
std,14.4135
min,0.1667
25%,21.0
50%,28.0
75%,39.0
max,80.0


In [62]:
# Store 'sex' as a categorical column instead of generic object strings.
titanic['sex'] = titanic['sex'].astype('category')

In [63]:
# Confirm the 'category' dtype and compare the reported memory usage with the earlier info() output.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   int64   
 1   survived   1309 non-null   int64   
 2   name       1309 non-null   object  
 3   sex        1309 non-null   category
 4   age        1046 non-null   float64 
 5   sibsp      1309 non-null   int64   
 6   parch      1309 non-null   int64   
 7   ticket     1309 non-null   object  
 8   fare       1309 non-null   object  
 9   cabin      1309 non-null   object  
 10  embarked   1309 non-null   object  
 11  boat       1309 non-null   object  
 12  body       1309 non-null   object  
 13  home.dest  1309 non-null   object  
dtypes: category(1), float64(1), int64(4), object(8)
memory usage: 134.5+ KB


## Casting With `pd.to_numeric()`

In [9]:
# Reload the raw file so 'age' is an object column with '?' placeholders again.
# The astype() section above already converted it to float, so without this
# reload the outputs of this section cannot be reproduced with "Run All".
titanic = pd.read_csv('data/titanic.csv')
titanic['age'].value_counts()

Unnamed: 0_level_0,count
age,Unnamed: 1_level_1
?,263
24,47
22,43
21,41
30,40
...,...
60.5,1
74,1
0.4167,1
11.5,1


In [11]:
# On an object column, describe() reports count/unique/top/freq rather than numeric statistics.
titanic['age'].describe()

Unnamed: 0,age
count,1309
unique,99
top,?
freq,263


In [15]:
# errors='coerce' turns values that cannot be parsed (the '?' entries) into NaN
# instead of raising; downcast='float' asks for a smaller float dtype where possible.
# The result is only displayed here, not stored.
pd.to_numeric(titanic['age'], errors='coerce', downcast='float')

Unnamed: 0,age
0,29.0000
1,0.9167
2,2.0000
3,30.0000
4,25.0000
...,...
1304,14.5000
1305,
1306,26.5000
1307,27.0000


In [17]:
# Check the stored column's dtype after the previous (unassigned) conversion.
titanic['age'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1309 entries, 0 to 1308
Series name: age
Non-Null Count  Dtype 
--------------  ----- 
1309 non-null   object
dtypes: object(1)
memory usage: 10.4+ KB


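**The column is still `object`: `pd.to_numeric()` returns a new Series, so the result has to be assigned back to the column.**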

In [18]:
# Assign the converted Series back so the column itself becomes numeric;
# unparsable entries become NaN because of errors='coerce'.
titanic['age'] = pd.to_numeric(titanic['age'], errors='coerce')

In [19]:
# Display the converted column.
titanic['age']

Unnamed: 0,age
0,29.0000
1,0.9167
2,2.0000
3,30.0000
4,25.0000
...,...
1304,14.5000
1305,
1306,26.5000
1307,27.0000


In [20]:
# Numeric summary statistics are available now that the column is numeric.
titanic['age'].describe()

Unnamed: 0,age
count,1046.0
mean,29.881135
std,14.4135
min,0.1667
25%,21.0
50%,28.0
75%,39.0
max,80.0


## isna() & dropna()

In [21]:
# Load a small game-statistics table that contains missing values and preview the first rows.
stats = pd.read_csv('data/game_stats.csv')
stats.head()

Unnamed: 0,name,league,points,assists,rebounds
0,bob,nba,22.0,5.0,10.0
1,jessie,,10.0,,2.0
2,stu,euroleague,,,
3,jackson,aba,9.0,,2.0
4,timothee,,8.0,,


In [26]:
# Boolean mask of the whole frame: True wherever a value is missing.
stats.isna()

Unnamed: 0,name,league,points,assists,rebounds
0,False,False,False,False,False
1,False,True,False,True,False
2,False,False,True,True,True
3,False,False,False,True,False
4,False,True,False,True,True
5,False,False,False,False,False
6,True,True,True,True,True


In [28]:
# The same missing-value mask for a single column.
stats['name'].isna()

Unnamed: 0,name
0,False
1,False
2,False
3,False
4,False
5,False
6,True


In [29]:
# Summing the boolean mask counts the missing values in 'name'.
stats['name'].isna().sum()

np.int64(1)

In [30]:
# Number of missing values in 'assists'.
stats['assists'].isna().sum()

np.int64(5)

In [31]:
# Missing-value count for every column at once.
stats.isna().sum()

Unnamed: 0,0
name,1
league,3
points,2
assists,5
rebounds,3


In [32]:
# Use the mask as a row filter: keep only rows whose 'name' is missing.
stats[stats['name'].isna()]

Unnamed: 0,name,league,points,assists,rebounds
6,,,,,


In [34]:
# Rows whose 'assists' value is missing.
stats[stats['assists'].isna()]

Unnamed: 0,name,league,points,assists,rebounds
1,jessie,,10.0,,2.0
2,stu,euroleague,,,
3,jackson,aba,9.0,,2.0
4,timothee,,8.0,,
6,,,,,


In [36]:
# The 'assists' column, including its missing entries.
stats['assists']

Unnamed: 0,assists
0,5.0
1,
2,
3,
4,
5,8.0
6,


In [35]:
# dropna() returns the column without its missing entries; the original index labels are kept.
stats['assists'].dropna()

Unnamed: 0,assists
0,5.0
5,8.0


In [37]:
# Keep only the non-missing assists in a separate Series. Binding the Series
# returned by dropna() avoids in-place mutation of a column pulled out of the
# frame; `stats` itself is not modified.
assists = stats['assists'].dropna()

In [38]:
# The standalone Series now holds only the non-missing values.
assists

Unnamed: 0,assists
0,5.0
5,8.0


In [39]:
# The frame still contains its missing values: dropping them from the Series did not alter it.
stats

Unnamed: 0,name,league,points,assists,rebounds
0,bob,nba,22.0,5.0,10.0
1,jessie,,10.0,,2.0
2,stu,euroleague,,,
3,jackson,aba,9.0,,2.0
4,timothee,,8.0,,
5,steph,nba,49.0,8.0,10.0
6,,,,,


In [40]:
# On the whole frame, dropna() keeps only rows with no missing values; the result is displayed, not stored.
stats.dropna()

Unnamed: 0,name,league,points,assists,rebounds
0,bob,nba,22.0,5.0,10.0
5,steph,nba,49.0,8.0,10.0
