In [1]:
import os, sys
import numpy

#https://pandas.pydata.org/
import pandas

In [26]:
# Titanic passenger data (dataset background: https://www.openml.org/d/40945),
# read here from a CSV file hosted on GitHub.
source = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'

# Parse the CSV into a DataFrame; ',' is already read_csv's default separator
dataframe = pandas.read_csv(source)

# Preview the first five rows (head() defaults to n=5)
dataframe.head()

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0


In [3]:
# Show dimensions as a (number of rows, number of columns) tuple
dataframe.shape

(1313, 6)

In [None]:
# Show summary statistics (count, mean, std, min, quartiles, max);
# by default describe() reports on the numeric columns of a mixed-type frame
dataframe.describe()

In [None]:
# Select the first row by integer position; a single row comes back as a Series
dataframe.iloc[0]

Name        Allen, Miss Elisabeth Walton
PClass                               1st
Age                                 29.0
Sex                               female
Survived                               1
SexCode                                1
Name: 0, dtype: object

In [None]:
# Select the first four rows (positions 0 through 3); the end of an iloc
# slice is exclusive, so 0:4 stops before position 4
dataframe.iloc[0:4]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1


In [5]:
# Look up a passenger's row by name

# Use the passenger name as the row index; drop=False keeps 'Name'
# available as a regular column as well
dataframe = dataframe.set_index('Name', drop=False)

# Label-based lookup of a single passenger
passengername = 'Allen, Miss Elisabeth Walton'
dataframe.loc[passengername]

Name        Allen, Miss Elisabeth Walton
PClass                               1st
Age                                 29.0
Sex                               female
Survived                               1
SexCode                                1
Name: Allen, Miss Elisabeth Walton, dtype: object

In [6]:
# Conditional row selection

# Show the first two rows where 'SexCode' is 1
# (in the rows previewed in this notebook, SexCode 1 accompanies Sex 'female')
dataframe[dataframe['SexCode'] == 1].head(2)

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Allen, Miss Elisabeth Walton","Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
"Allison, Miss Helen Loraine","Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [7]:
# Filter rows: female passengers aged 65 or older
is_female = dataframe['Sex'] == 'female'
is_senior = dataframe['Age'] >= 65
dataframe[is_female & is_senior]

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Crosby, Mrs Edward Gifford (Catherine Elizabeth Halstead)","Crosby, Mrs Edward Gifford (Catherine Elizabet...",1st,69.0,female,1,1


In [None]:
# Filter rows: male passengers aged 65 or older
dataframe.loc[(dataframe['Age'] >= 65) & (dataframe['Sex'] == 'male')]

In [None]:
# Calculate statistics for the 'Age' column

# Look the column up once and reuse it for every statistic
ages = dataframe['Age']

# int() truncates the fractional part of each value before printing
print('Maximum:', int(ages.max()))
print('Minimum:', int(ages.min()))
print('Mean:', int(ages.mean()))
print('Sum:', int(ages.sum()))
# count() only counts the non-missing ages
print('Count:', ages.count())

In [10]:
# Select unique values: the distinct entries of the 'Sex' column, as an array

dataframe['Sex'].unique()

array(['female', 'male'], dtype=object)

In [11]:
# Show counts: how many rows carry each distinct value of 'Sex'
dataframe['Sex'].value_counts()


male      851
female    462
Name: Sex, dtype: int64

In [14]:
# Select rows whose 'Age' is missing, show the first three rows

dataframe[dataframe['Age'].isnull()].head(3)

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Aubert, Mrs Leontine Pauline","Aubert, Mrs Leontine Pauline",1st,,female,1,1
"Barkworth, Mr Algernon H","Barkworth, Mr Algernon H",1st,,male,1,0
"Baumann, Mr John D","Baumann, Mr John D",1st,,male,0,0


In [17]:
# Drop duplicate rows, report how many rows remain, and show the first two

# Deduplicate once and reuse the result instead of recomputing it
deduped = dataframe.drop_duplicates()
print(len(deduped))

# A bare expression is only displayed when it is the last statement of the
# cell, so the preview has to come at the end
deduped.head(2)

1313


In [27]:
# Compare the row count before and after removing duplicate rows

rows_before = len(dataframe)
rows_after = len(dataframe.drop_duplicates())

print("Number Of Rows In The Original DataFrame:", rows_before)
print("Number Of Rows After Deduping:", rows_after)

Number Of Rows In The Original DataFrame: 1313
Number Of Rows After Deduping: 1313


In [32]:
# Boolean flag per row: True where the row repeats an earlier, identical row
dataframe.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1308    False
1309    False
1310    False
1311    False
1312    False
Length: 1313, dtype: bool

In [36]:
# Count the rows that have an exact duplicate somewhere in the frame;
# keep=False flags every member of a duplicate group, not only the repeats
duplicate_mask = dataframe.duplicated(keep=False)
nduplicated = int(duplicate_mask.sum())
print(nduplicated)

0


In [None]:
# Group rows by the values of the column 'Sex', calculate mean of each group

# numeric_only=True restricts the mean to the numeric columns. Without it,
# pandas >= 2.0 also tries to average the string columns (e.g. 'Name',
# 'PClass') and raises a TypeError.
dataframe.groupby('Sex').mean(numeric_only=True)

Unnamed: 0_level_0,Age,Survived,SexCode
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,29.396424,0.666667,1.0
male,31.014338,0.166863,0.0


In [None]:
# Loop over the first two names and print each one uppercased, then lowercased

for passenger in dataframe['Name'].head(2):
  print(passenger.upper())
  print(passenger.lower())



In [21]:
# Apply a function on the elements of the dataframe

# Create function
def uppercase(x):
  """Return ``x`` with the letter 'a' appended, converted to upper case.

  Note that the result is one character longer than the input: the
  suffix is added before upper-casing, so it comes out as a trailing 'A'.
  """
  return (x + 'a').upper()

# Run uppercase() on every name, then keep the first two results
dataframe['Name'].apply(uppercase).iloc[:2]

Name
Allen, Miss Elisabeth Walton    ALLEN, MISS ELISABETH WALTONA
Allison, Miss Helen Loraine      ALLISON, MISS HELEN LORAINEA
Name: Name, dtype: object

In [25]:

# Create function
def special(x):
  """Return the string ``x`` repeated twice, joined by an underscore."""
  return '_'.join((x, x))

# Run special() on every name, then keep the first two results
dataframe['Name'].apply(special).head(2)

Name
Allen, Miss Elisabeth Walton    Allen, Miss Elisabeth Walton_Allen, Miss Elisa...
Allison, Miss Helen Loraine     Allison, Miss Helen Loraine_Allison, Miss Hele...
Name: Name, dtype: object

wrangling numerical data ---------------------------------------------

In [38]:
import numpy
from sklearn import preprocessing

# Create feature: five observations arranged as a single column (shape 5 x 1)
feature = numpy.array([-500.5, -100.1, 0, 100.1, 900.9]).reshape(-1, 1)
print(feature)

[[-500.5]
 [-100.1]
 [   0. ]
 [ 100.1]
 [ 900.9]]


In [44]:
# Rescale a feature with min-max scaling

# Bounds of the target interval
minval, maxval = 0, 10

# Scaler that maps a feature onto the interval [minval, maxval]
minmax_scale = preprocessing.MinMaxScaler(feature_range=(minval, maxval))

In [45]:
# Scale feature: fit the min-max scaler to the data and rescale it in one call
scaled_feature = minmax_scale.fit_transform(feature)
print(scaled_feature)

[[ 0.        ]
 [ 2.85714286]
 [ 3.57142857]
 [ 4.28571429]
 [10.        ]]


In [None]:
# Create standard scaler (rescales to a mean of 0 and a standard deviation of 1)
scaler = preprocessing.StandardScaler()
# Fit the scaler to the feature and standardize it in one call
standardized = scaler.fit_transform(feature)

print(standardized)

[[-1.26687088]
 [-0.39316683]
 [-0.17474081]
 [ 0.0436852 ]
 [ 1.79109332]]


In [None]:
# Create a custom transforming feature
from sklearn.preprocessing import FunctionTransformer

# Define a simple function
def add_ten(x):
  """Return ``x + 10``; for an array this adds 10 to every element."""
  shifted = x + 10
  return shifted

# Create transformer: wrap add_ten so it can be used like any other transformer
ten_transformer = FunctionTransformer(add_ten)

# Transform feature matrix: add_ten is applied to it, adding 10 to every value
tt = ten_transformer.transform(feature)

print(tt)

[[-490.5]
 [ -90.1]
 [  10. ]
 [ 110.1]
 [ 910.9]]
