## 1 - Loading the Datasets

### Dataset:
Heart Disease UCI - https://archive.ics.uci.edu/ml/datasets/Heart+Disease


In [4]:
# Importing the Libraries

import numpy as np
import pandas as pd
import pickle

# Relative path of the CSV file used by the loading examples in this notebook
filename = "heart.csv"

In [5]:
# Load the CSV into a DataFrame with pandas' read_csv, then preview the first rows

df = pd.read_csv(filename)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [6]:
# Load the CSV with NumPy's loadtxt

# loadtxt parses every field as a float by default; skiprows=1 skips the header line
data = np.loadtxt(filename, delimiter= ",", skiprows=1)
print(data) # Prints a 2-D float array (one row per data line of the CSV)

[[63.  1.  3. ...  0.  1.  1.]
 [37.  1.  2. ...  0.  2.  1.]
 [41.  0.  1. ...  0.  2.  1.]
 ...
 [68.  1.  0. ...  2.  3.  0.]
 [57.  1.  0. ...  1.  3.  0.]
 [57.  0.  1. ...  1.  2.  0.]]


In [7]:
# Load the CSV with NumPy's genfromtxt

# names=True takes the field names from the header row; dtype=None lets NumPy infer each column's type from the data
data = np.genfromtxt(filename, delimiter= ',', dtype= None, names= True, encoding= 'utf-8-sig')
print(data)
print(data.dtype)
# The result is a structured array (each record prints as a tuple).
# In the dtype, e.g. i4 is a 4-byte integer field and f8 an 8-byte float field

[(63, 1, 3, 145, 233, 1, 0, 150, 0, 2.3, 0, 0, 1, 1)
 (37, 1, 2, 130, 250, 0, 1, 187, 0, 3.5, 0, 0, 2, 1)
 (41, 0, 1, 130, 204, 0, 0, 172, 0, 1.4, 2, 0, 2, 1)
 (56, 1, 1, 120, 236, 0, 1, 178, 0, 0.8, 2, 0, 2, 1)
 (57, 0, 0, 120, 354, 0, 1, 163, 1, 0.6, 2, 0, 2, 1)
 (57, 1, 0, 140, 192, 0, 1, 148, 0, 0.4, 1, 0, 1, 1)
 (56, 0, 1, 140, 294, 0, 0, 153, 0, 1.3, 1, 0, 2, 1)
 (44, 1, 1, 120, 263, 0, 1, 173, 0, 0. , 2, 0, 3, 1)
 (52, 1, 2, 172, 199, 1, 1, 162, 0, 0.5, 2, 0, 3, 1)
 (57, 1, 2, 150, 168, 0, 1, 174, 0, 1.6, 2, 0, 2, 1)
 (54, 1, 0, 140, 239, 0, 1, 160, 0, 1.2, 2, 0, 2, 1)
 (48, 0, 2, 130, 275, 0, 1, 139, 0, 0.2, 2, 0, 2, 1)
 (49, 1, 1, 130, 266, 0, 1, 171, 0, 0.6, 2, 0, 2, 1)
 (64, 1, 3, 110, 211, 0, 0, 144, 1, 1.8, 1, 0, 2, 1)
 (58, 0, 3, 150, 283, 1, 0, 162, 0, 1. , 2, 0, 2, 1)
 (50, 0, 2, 120, 219, 0, 1, 158, 0, 1.6, 1, 0, 2, 1)
 (58, 0, 2, 120, 340, 0, 1, 172, 0, 0. , 2, 0, 2, 1)
 (66, 0, 3, 150, 226, 0, 1, 114, 0, 2.6, 0, 0, 2, 1)
 (43, 1, 0, 150, 247, 0, 1, 171, 0, 1.5, 2, 0,

In [8]:
# Loading a file manually
# This is a general way to read a "spaghetti" file (a text file that needs custom, line-by-line parsing)


def load_file(filename):
    """Read a comma-separated text file into a DataFrame of floats.

    The first line is treated as the header and supplies the column names.
    Every following non-blank line is split on commas and each field is
    converted to ``float``.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read. It is opened as 'utf-8-sig' so that a
        leading byte-order mark does not end up in the first column name.

    Returns
    -------
    pandas.DataFrame
        One row per data line, with columns named after the header fields.
        An empty file yields an empty DataFrame.

    Raises
    ------
    ValueError
        If a field cannot be converted to float.
    """
    with open(filename, encoding= 'utf-8-sig') as f:
        # splitlines() drops the line terminators, unlike readlines()
        lines = f.read().splitlines()

    # The header is the first line (if there is one)
    cols = lines[0].split(',') if lines else []

    # Use float for simplicity; blank lines carry no data and are skipped
    data = [
        [float(x) for x in line.split(',')]
        for line in lines[1:]
        if line.strip()
    ]

    # Build the DataFrame once, after all rows have been parsed
    return pd.DataFrame(data, columns= cols)

In [9]:
# Try the manually built loader on the same CSV file

load_file(filename)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,3.0,145.0,233.0,1.0,0.0,150.0,0.0,2.3,0.0,0.0,1.0,1.0
1,37.0,1.0,2.0,130.0,250.0,0.0,1.0,187.0,0.0,3.5,0.0,0.0,2.0,1.0
2,41.0,0.0,1.0,130.0,204.0,0.0,0.0,172.0,0.0,1.4,2.0,0.0,2.0,1.0
3,56.0,1.0,1.0,120.0,236.0,0.0,1.0,178.0,0.0,0.8,2.0,0.0,2.0,1.0
4,57.0,0.0,0.0,120.0,354.0,0.0,1.0,163.0,1.0,0.6,2.0,0.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57.0,0.0,0.0,140.0,241.0,0.0,1.0,123.0,1.0,0.2,1.0,0.0,3.0,0.0
299,45.0,1.0,3.0,110.0,264.0,0.0,1.0,132.0,0.0,1.2,1.0,0.0,3.0,0.0
300,68.0,1.0,0.0,144.0,193.0,1.0,1.0,141.0,0.0,3.4,1.0,2.0,3.0,0.0
301,57.0,1.0,0.0,130.0,131.0,0.0,1.0,115.0,1.0,1.2,1.0,1.0,3.0,0.0


In [None]:
# Using Pickles

# Pickle files can stop loading when the Python / pandas version or the text encoding changes.
# HDF5 is a more portable alternative.
# SECURITY: unpickling can execute arbitrary code - only load pickle files from a trusted source.

pd.read_pickle('xxx.pkl')  # 'xxx.pkl' is a placeholder path

### Data Conversions

In [14]:
df = pd.read_csv(filename)

# Convert the DataFrame to a NumPy array. to_numpy() may return a view that shares memory with the
# DataFrame, so editing the array can modify the original data; use copy=True (or .copy()) for an independent array
data = df.to_numpy()
print(data[:1])
print(type(data))

# Another method - the .values attribute. It is not deprecated, but the pandas docs recommend to_numpy() instead
data = df.values
print(data[:1])

# The columns mix integers and floats, so the array takes one common dtype for all values (float64 here)

[[ 63.    1.    3.  145.  233.    1.    0.  150.    0.    2.3   0.    0.
    1.    1. ]]
<class 'numpy.ndarray'>
[[ 63.    1.    3.  145.  233.    1.    0.  150.    0.    2.3   0.    0.
    1.    1. ]]


### Saving & Serializing DataFrame

In [15]:
# Create a random DataFrame (100000 rows x 4 columns of floats in [0, 1))
# A seeded generator is used so that Restart & Run All reproduces the same values

rng = np.random.default_rng(42)
df = pd.DataFrame(rng.random(size=(100000, 4)), columns= ['A', 'B', 'C', 'D'])

df.head()

Unnamed: 0,A,B,C,D
0,0.73016,0.453014,0.006047,0.210602
1,0.186176,0.771055,0.91301,0.087061
2,0.601354,0.919407,0.979665,0.625264
3,0.487094,0.533641,0.027162,0.496143
4,0.443925,0.079541,0.713986,0.64149


In [None]:
# Save the DataFrame as a CSV file

df.to_csv('filename.csv', index= False, float_format= "%0.4f") # Write floats with 4 decimal places and omit the index column

In [None]:
# Save as pickle (serializes the whole DataFrame to a single file)
df.to_pickle('data.pkl')

In [None]:
# For Big Data - HDF5 (Hierarchical Data Format)
# HDF5 support in pandas requires the PyTables package (installed under the name `tables`)

# !pip install tables
# key names the group inside the HDF5 file; format='table' stores the data as a PyTables Table
df.to_hdf('data.hdf5', key= 'data', format= 'table')

### Trying for String and Categorical Data

In [18]:
# Load the astronauts dataset and preview the first rows
df = pd.read_csv('astronauts.csv')
df.head()

Unnamed: 0,Name,Year,Group,Status,Birth Date,Birth Place,Gender,Alma Mater,Undergraduate Major,Graduate Major,Military Rank,Military Branch,Space Flights,Space Flight (hr),Space Walks,Space Walks (hr),Missions,Death Date,Death Mission
0,Joseph M. Acaba,2004.0,19.0,Active,5/17/1967,"Inglewood, CA",Male,University of California-Santa Barbara; Univer...,Geology,Geology,,,2,3307,2,13.0,"STS-119 (Discovery), ISS-31/32 (Soyuz)",,
1,Loren W. Acton,,,Retired,3/7/1936,"Lewiston, MT",Male,Montana State University; University of Colorado,Engineering Physics,Solar Physics,,,1,190,0,0.0,STS 51-F (Challenger),,
2,James C. Adamson,1984.0,10.0,Retired,3/3/1946,"Warsaw, NY",Male,US Military Academy; Princeton University,Engineering,Aerospace Engineering,Colonel,US Army (Retired),2,334,0,0.0,"STS-28 (Columbia), STS-43 (Atlantis)",,
3,Thomas D. Akers,1987.0,12.0,Retired,5/20/1951,"St. Louis, MO",Male,University of Missouri-Rolla,Applied Mathematics,Applied Mathematics,Colonel,US Air Force (Retired),4,814,4,29.0,"STS-41 (Discovery), STS-49 (Endeavor), STS-61 ...",,
4,Buzz Aldrin,1963.0,3.0,Retired,1/20/1930,"Montclair, NJ",Male,US Military Academy; MIT,Mechanical Engineering,Astronautics,Colonel,US Air Force (Retired),2,289,2,8.0,"Gemini 12, Apollo 11",,


In [19]:
# List the files in the current working directory (shows the size of each file on disk)

%ls

 Volume in drive C has no label.
 Volume Serial Number is 82A3-F7EB

 Directory of C:\Users\Ravi\Documents\Data_Science_Courses\Data Manupulation in Python\1 - Datasets Basics\Files

28-05-2020  22:39    <DIR>          .
28-05-2020  22:39    <DIR>          ..
28-05-2020  21:37    <DIR>          .ipynb_checkpoints
28-05-2020  22:39            50,155 1 - Datasets Introduction.ipynb
20-09-2019  05:34            81,593 astronauts.csv
01-10-2019  15:45            11,328 heart.csv
               3 File(s)        143,076 bytes
               3 Dir(s)  15,932,559,360 bytes free


### Inspecting the Data

In [20]:
# Show the last 2 rows of the DataFrame
df.tail(2)

Unnamed: 0,Name,Year,Group,Status,Birth Date,Birth Place,Gender,Alma Mater,Undergraduate Major,Graduate Major,Military Rank,Military Branch,Space Flights,Space Flight (hr),Space Walks,Space Walks (hr),Missions,Death Date,Death Mission
355,John W. Young,1962.0,2.0,Retired,9/24/1930,"San Francisco, CA",Male,Georgia Institute of Technology,Aeronautical Engineering,,Captain,US Navy (Retired),6,835,3,20.0,"Gemini 3, Gemini 10, Apollo 10, Apollo 16, STS...",,
356,George D. Zamka,1998.0,17.0,Retired,6/29/1962,"Jersey City, NJ",Male,US Naval Academy; Florida Institute of Technology,Mathematics,Engineering Management,Colonel,US Marine Corps (Retired),2,692,0,0.0,"STS-120 (Discovery), STS-130 (Endeavor)",,


In [24]:
# Pull 3 random rows from the data (sampling without replacement).
# Only the last expression of a cell is rendered, so this first result is not displayed.

df.sample(3)

# Sample with replacement - the same row may be drawn more than once. Useful for "Bootstrap Resampling"

df.sample(3, replace= True)

Unnamed: 0,Name,Year,Group,Status,Birth Date,Birth Place,Gender,Alma Mater,Undergraduate Major,Graduate Major,Military Rank,Military Branch,Space Flights,Space Flight (hr),Space Walks,Space Walks (hr),Missions,Death Date,Death Mission
54,Kenneth D. Cameron,1984.0,10.0,Retired,11/29/1949,"Cleveland, OH",Male,MIT; Michigan State University,Aeronautics & Astronautics,Aeronautics & Astronautics; Business Administr...,Colonel,US Marine Corps (Retired),3,562,0,0.0,"STS-37 (Atlantis), STS-56 (Discovery), STS-74 ...",,
6,Joseph P. Allen,1967.0,6.0,Retired,6/27/1937,"Crawsfordsville, IN",Male,DePauw University; Yale University,Mathematics & Physics,Physics,,,2,313,2,12.0,"ST-5 (Columbia), STS 51-A (Discovery)",,
109,Anna L. Fisher,1978.0,8.0,Management,8/24/1949,"New York, NY",Female,University of California-Los Angeles,Chemistry,Chemistry; Medicine,,,1,191,0,0.0,STS 51-A (Discovery),,


In [25]:
# Summarize the DataFrame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357 entries, 0 to 356
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Name                 357 non-null    object 
 1   Year                 330 non-null    float64
 2   Group                330 non-null    float64
 3   Status               357 non-null    object 
 4   Birth Date           357 non-null    object 
 5   Birth Place          357 non-null    object 
 6   Gender               357 non-null    object 
 7   Alma Mater           356 non-null    object 
 8   Undergraduate Major  335 non-null    object 
 9   Graduate Major       298 non-null    object 
 10  Military Rank        207 non-null    object 
 11  Military Branch      211 non-null    object 
 12  Space Flights        357 non-null    int64  
 13  Space Flight (hr)    357 non-null    int64  
 14  Space Walks          357 non-null    int64  
 15  Space Walks (hr)     357 non-null    flo

In [26]:
# Get descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Year,Group,Space Flights,Space Flight (hr),Space Walks,Space Walks (hr)
count,330.0,330.0,357.0,357.0,357.0,357.0
mean,1985.106061,11.409091,2.364146,1249.266106,1.246499,7.707283
std,13.216147,5.149962,1.4287,1896.759857,2.056989,13.367973
min,1959.0,1.0,0.0,0.0,0.0,0.0
25%,1978.0,8.0,1.0,289.0,0.0,0.0
50%,1987.0,12.0,2.0,590.0,0.0,0.0
75%,1996.0,16.0,3.0,1045.0,2.0,12.0
max,2009.0,20.0,7.0,12818.0,10.0,67.0


In [28]:
# Get the shape of the data as (number of rows, number of columns)
df.shape

(357, 19)

In [29]:
# Get the pairwise correlation between the numeric columns.
# The numeric columns are selected explicitly because recent pandas versions no longer
# drop non-numeric columns silently in corr() and raise on the string columns instead.
df.select_dtypes(include='number').corr()

Unnamed: 0,Year,Group,Space Flights,Space Flight (hr),Space Walks,Space Walks (hr)
Year,1.0,0.980934,0.03642,0.331386,0.210073,0.253502
Group,0.980934,1.0,-0.011386,0.325683,0.217891,0.261384
Space Flights,0.03642,-0.011386,1.0,0.325233,0.257073,0.258642
Space Flight (hr),0.331386,0.325683,0.325233,1.0,0.472796,0.454408
Space Walks,0.210073,0.217891,0.257073,0.472796,1.0,0.985755
Space Walks (hr),0.253502,0.261384,0.258642,0.454408,0.985755,1.0


In [30]:
# Count how many rows fall in each Year, most frequent first (missing years are excluded by default)

df['Year'].value_counts()

1996.0    35
1978.0    35
1998.0    25
1990.0    23
1966.0    19
1995.0    19
1980.0    19
1992.0    19
1984.0    18
2000.0    17
1987.0    15
1963.0    14
1985.0    13
1967.0    11
2004.0    11
2009.0     9
1962.0     8
1969.0     7
1959.0     7
1965.0     6
Name: Year, dtype: int64

In [31]:
# Get the maximum value of each column (string columns are compared lexicographically)
# NOTE(review): columns that mix strings and missing values (e.g. Alma Mater) are absent from the
# output below; newer pandas versions may raise a TypeError here instead of dropping them - confirm
# against the installed version

df.max()

Name                 Yvonne D. Cagle
Year                            2009
Group                             20
Status                       Retired
Birth Date                  9/9/1952
Birth Place              Yonkers, NY
Gender                          Male
Space Flights                      7
Space Flight (hr)              12818
Space Walks                       10
Space Walks (hr)                  67
dtype: object