In [None]:
# Second example - slightly more complicated this time

In [None]:
# Pandas for data handling
import pandas # https://pandas.pydata.org/
from pandas.plotting import scatter_matrix

# Step 1: read the CSV into a DataFrame. header='infer' is the read_csv default
# (spelled out here): column names are taken from the first row. Change it if
# the file has no header row.
print('Loading data from file ...')
dataset = pandas.read_csv('floats.csv', header='infer')
print('done \n')

# Step 2: keep only complete rows. axis=0 / how='any' are the dropna defaults:
# a row is discarded as soon as a single value in it is missing.
print('Removing rows with missing data ...')
dataset = dataset.dropna(axis=0, how='any')
print('done \n')

In [None]:
# Ok, what does the data look like?

# Display for pretty tables
from IPython.display import display

# Spot check: first and last five rows (five is the head()/tail() default).
# display() renders each object it is given, in order.
print('Sample rows from the dataset (top and bottom five):')
display(dataset.head(), dataset.tail())
print('\n')

In [None]:
# Numerical summary table of the cleaned dataset.
# include='all' asks describe() to summarise every column rather than only the
# numeric ones; the quartile list is the describe() default, written out.

print('Summary of the dataset - Univariate statistics')
display(dataset.describe(percentiles=[0.25, 0.5, 0.75], include='all'))
print('\n')

In [None]:
# Let's set up a problem: can we predict 'callSign' from these three features: 'Depth', 'Temperature' and 'Salinity'?

In [None]:
print('Reading list of problem variables X and Y...')

# Column names first: the three predictors and the single target
X_name = [
    'Depth',
    'Temperature',
    'Salinity',
]
y_name = 'callSign'

# Then slice the dataset: X is the table of features, y the column of labels
X = dataset[X_name]
y = dataset[y_name]

print('done \n')

In [None]:
# Spot checks for X: three rows from each end, then the summary statistics

print('Sample rows from X (top and bottom):')
display(X.head(3), X.tail(3))  # display() renders each argument in turn
print('\n')

print('Statistical summary of X:')
display(X.describe())
print('\n')

In [None]:
# Let's look at some graphical summaries

%matplotlib inline
import matplotlib.pyplot as pl  # https://matplotlib.org/

print('Summary of X - Univariate graphs')
print()

# Square-root rule for the number of histogram bins, computed from the actual
# row count of X instead of a hard-coded 75, so the histograms stay sensible
# when the input file grows or shrinks. Clamped to at least one bin so a tiny
# or empty table does not request zero bins.
hist_bins = max(1, round(len(X) ** 0.5))

print('Histograms:')
X.hist(figsize=(15, 5), bins=hist_bins)  # one histogram per column of X
pl.show()

print('Box plots:')
# subplots=True with unshared axes gives every column its own panel and scale
X.plot(figsize=(15, 5), kind='box', subplots=True, sharex=False, sharey=False)
pl.show()

In [None]:
# More graphical summaries

import seaborn as sns   # https://seaborn.pydata.org/

print('Summary of X - Bivariate (column-pair) graphs:')

# Correlations between the columns of X, drawn as a heat map whose colour
# scale is pinned to the full [-1, 1] range
print('Correlation matrix:')
corr = X.corr()
sns.heatmap(data=corr, vmin=-1.0, vmax=1.0, cmap='coolwarm')
pl.show()

# Scatter-plot matrix over the columns of X
print('Scatter plots:')
scatter_matrix(frame=X, figsize=(16, 16))
pl.show()

In [None]:
# Spot check for y: number of rows carrying each distinct label,
# most frequent first (sort=True / ascending=False are the defaults)

print('Summary of y:')
display(y.value_counts(sort=True, ascending=False))

In [None]:
# Time for ML

# Split data into training and testing datasets
from sklearn import model_selection

# Hold-out settings
seed = 7          # fixed seed so every run produces the same split
test_pct = 0.20   # fraction of the data points kept back for testing (20%)

# Split features and labels together into training and testing parts
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=test_pct,
    random_state=seed,
)
print('done with setting aside data for testing')

In [None]:
# Now train the model on the data

# algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

print('training model...')
# liblinear fits binary problems; handing it a target with more than two
# classes directly through LogisticRegression is deprecated in recent
# scikit-learn releases (FutureWarning now, an error later). Wrapping the
# estimator in OneVsRestClassifier is the documented replacement and keeps the
# same scheme: one binary classifier per class, highest score wins.
# NOTE(review): presumes 'callSign' may hold more than two classes - for a
# two-class target the wrapper simply fits a single binary classifier.
modelName = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
modelName.fit(X_train, y_train)  # train the model on the training data
print('done with training the model')

In [None]:
# How does the model do with predictions? 

# Let's compute its accuracy on the data we set aside.
from sklearn.metrics import accuracy_score

print('computing accuracy...')
# Predict labels for the held-out test rows, then report the fraction that
# match the true test labels
y_predicted = modelName.predict(X_test)
print('Accuracy:', accuracy_score(y_true=y_test, y_pred=y_predicted))