In [5]:
# ms-python.python added
import os
try:
	os.chdir(os.path.join(os.getcwd(), 'module4'))
	print(os.getcwd())
except:
	pass

In [6]:
from IPython import get_ipython


 Lambda School Data Science, Unit 2: Predictive Modeling

 # Regression & Classification, Module 4


 ## Assignment

 - [ ] Watch Aaron's [video #1](https://www.youtube.com/watch?v=pREaWFli-5I) (12 minutes) & [video #2](https://www.youtube.com/watch?v=bDQgVt4hFgY) (9 minutes) to learn about the mathematics of Logistic Regression.
 - [ ] [Sign up for a Kaggle account](https://www.kaggle.com/), if you don’t already have one. Go to our Kaggle InClass competition website. You will be given the URL in Slack. Go to the Rules page. Accept the rules of the competition.
 - [ ] Do train/validate/test split with the Tanzania Waterpumps data.
 - [ ] Begin with baselines for classification.
 - [ ] Use scikit-learn for logistic regression.
 - [ ] Get your validation accuracy score.
 - [ ] Submit your predictions to our Kaggle competition. (Go to our Kaggle InClass competition webpage. Use the blue **Submit Predictions** button to upload your CSV file. Or you can use the Kaggle API to submit your predictions.)
 - [ ] Commit your notebook to your fork of the GitHub repo.

 ---


 ## Stretch Goals

 - [ ] Add your own stretch goal(s) !
 - [ ] Clean the data. For ideas, refer to [The Quartz guide to bad data](https://github.com/Quartz/bad-data-guide),  a "reference to problems seen in real-world data along with suggestions on how to resolve them." One of the issues is ["Zeros replace missing values."](https://github.com/Quartz/bad-data-guide#zeros-replace-missing-values)
 - [ ] Make exploratory visualizations.
 - [ ] Do one-hot encoding. For example, you could try `quantity`, `basin`, `extraction_type_class`, and more. (But remember it may not work with high cardinality categoricals.)
 - [ ] Do [feature scaling](https://scikit-learn.org/stable/modules/preprocessing.html).
 - [ ] Get and plot your coefficients.
 - [ ] Try [scikit-learn pipelines](https://scikit-learn.org/stable/modules/compose.html).

 ---

 ## Data Dictionary

 ### Features

 Your goal is to predict the operating condition of a waterpoint for each record in the dataset. You are provided the following set of information about the waterpoints:

 - `amount_tsh` : Total static head (amount water available to waterpoint)
 - `date_recorded` : The date the row was entered
 - `funder` : Who funded the well
 - `gps_height` : Altitude of the well
 - `installer` : Organization that installed the well
 - `longitude` : GPS coordinate
 - `latitude` : GPS coordinate
 - `wpt_name` : Name of the waterpoint if there is one
 - `num_private` :
 - `basin` : Geographic water basin
 - `subvillage` : Geographic location
 - `region` : Geographic location
 - `region_code` : Geographic location (coded)
 - `district_code` : Geographic location (coded)
 - `lga` : Geographic location
 - `ward` : Geographic location
 - `population` : Population around the well
 - `public_meeting` : True/False
 - `recorded_by` : Group entering this row of data
 - `scheme_management` : Who operates the waterpoint
 - `scheme_name` : Who operates the waterpoint
 - `permit` : If the waterpoint is permitted
 - `construction_year` : Year the waterpoint was constructed
 - `extraction_type` : The kind of extraction the waterpoint uses
 - `extraction_type_group` : The kind of extraction the waterpoint uses
 - `extraction_type_class` : The kind of extraction the waterpoint uses
 - `management` : How the waterpoint is managed
 - `management_group` : How the waterpoint is managed
 - `payment` : What the water costs
 - `payment_type` : What the water costs
 - `water_quality` : The quality of the water
 - `quality_group` : The quality of the water
 - `quantity` : The quantity of water
 - `quantity_group` : The quantity of water
 - `source` : The source of the water
 - `source_type` : The source of the water
 - `source_class` : The source of the water
 - `waterpoint_type` : The kind of waterpoint
 - `waterpoint_type_group` : The kind of waterpoint

 ### Labels

 There are three possible values:

 - `functional` : the waterpoint is operational and there are no repairs needed
 - `functional needs repair` : the waterpoint is operational, but needs repairs
 - `non functional` : the waterpoint is not operational

 ---

 ## Generate a submission

 Your code to generate a submission file may look like this:

 ```python
 # estimator is your model or pipeline, which you've fit on X_train

 # X_test is your pandas dataframe or numpy array,
 # with the same number of rows, in the same order, as test_features.csv,
 # and the same number of columns, in the same order, as X_train

 y_pred = estimator.predict(X_test)


 # Makes a dataframe with two columns, id and status_group,
 # and writes to a csv file, without the index

 sample_submission = pd.read_csv('sample_submission.csv')
 submission = sample_submission.copy()
 submission['status_group'] = y_pred
 submission.to_csv('your-submission-filename.csv', index=False)
 ```

 If you're working locally, the csv file is saved in the same directory as your notebook.

 If you're using Google Colab, you can use this code to download your submission csv file.

 ```python
 from google.colab import files
 files.download('your-submission-filename.csv')
 ```

 ---

In [7]:
import os, sys
in_colab = 'google.colab' in sys.modules

# If you're in Colab...
if in_colab:
    # Pull files from Github repo
    os.chdir('/content')
    get_ipython().system('git init .')
    get_ipython().system('git remote add origin https://github.com/LambdaSchool/DS-Unit-2-Regression-Classification.git')
    get_ipython().system('git pull origin master')
    
    # Install required python packages
    get_ipython().system('pip install -r requirements.txt')
    
    # Change into directory for module
    os.chdir('module4')



In [8]:
# Ignore this Numpy warning when using Plotly Express:
# FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning, module='numpy')



In [9]:
# Read the Tanzania Waterpumps data
# train_features.csv : the training set features
# train_labels.csv : the training set labels
# test_features.csv : the test set features
# sample_submission.csv : a sample submission file in the correct format
    
import pandas as pd

train_features = pd.read_csv('../data/waterpumps/train_features.csv')
train_labels = pd.read_csv('../data/waterpumps/train_labels.csv')
test_features = pd.read_csv('../data/waterpumps/test_features.csv')
sample_submission = pd.read_csv('../data/waterpumps/sample_submission.csv')

assert train_features.shape == (59400, 40)
assert train_labels.shape == (59400, 2)
assert test_features.shape == (14358, 40)
assert sample_submission.shape == (14358, 2)



FileNotFoundError: [Errno 2] File b'../data/waterpumps/train_features.csv' does not exist: b'../data/waterpumps/train_features.csv'

In [10]:
# ms-python.python added
import os
try:
	os.chdir(os.path.join(os.getcwd(), 'module4'))
	print(os.getcwd())
except:
	pass

In [11]:
from IPython import get_ipython


 Lambda School Data Science, Unit 2: Predictive Modeling

 # Regression & Classification, Module 4


 ## Assignment

 - [ ] Watch Aaron's [video #1](https://www.youtube.com/watch?v=pREaWFli-5I) (12 minutes) & [video #2](https://www.youtube.com/watch?v=bDQgVt4hFgY) (9 minutes) to learn about the mathematics of Logistic Regression.
 - [ ] [Sign up for a Kaggle account](https://www.kaggle.com/), if you don’t already have one. Go to our Kaggle InClass competition website. You will be given the URL in Slack. Go to the Rules page. Accept the rules of the competition.
 - [ ] Do train/validate/test split with the Tanzania Waterpumps data.
 - [ ] Begin with baselines for classification.
 - [ ] Use scikit-learn for logistic regression.
 - [ ] Get your validation accuracy score.
 - [ ] Submit your predictions to our Kaggle competition. (Go to our Kaggle InClass competition webpage. Use the blue **Submit Predictions** button to upload your CSV file. Or you can use the Kaggle API to submit your predictions.)
 - [ ] Commit your notebook to your fork of the GitHub repo.

 ---


 ## Stretch Goals

 - [ ] Add your own stretch goal(s) !
 - [ ] Clean the data. For ideas, refer to [The Quartz guide to bad data](https://github.com/Quartz/bad-data-guide),  a "reference to problems seen in real-world data along with suggestions on how to resolve them." One of the issues is ["Zeros replace missing values."](https://github.com/Quartz/bad-data-guide#zeros-replace-missing-values)
 - [ ] Make exploratory visualizations.
 - [ ] Do one-hot encoding. For example, you could try `quantity`, `basin`, `extraction_type_class`, and more. (But remember it may not work with high cardinality categoricals.)
 - [ ] Do [feature scaling](https://scikit-learn.org/stable/modules/preprocessing.html).
 - [ ] Get and plot your coefficients.
 - [ ] Try [scikit-learn pipelines](https://scikit-learn.org/stable/modules/compose.html).

 ---

 ## Data Dictionary

 ### Features

 Your goal is to predict the operating condition of a waterpoint for each record in the dataset. You are provided the following set of information about the waterpoints:

 - `amount_tsh` : Total static head (amount water available to waterpoint)
 - `date_recorded` : The date the row was entered
 - `funder` : Who funded the well
 - `gps_height` : Altitude of the well
 - `installer` : Organization that installed the well
 - `longitude` : GPS coordinate
 - `latitude` : GPS coordinate
 - `wpt_name` : Name of the waterpoint if there is one
 - `num_private` :
 - `basin` : Geographic water basin
 - `subvillage` : Geographic location
 - `region` : Geographic location
 - `region_code` : Geographic location (coded)
 - `district_code` : Geographic location (coded)
 - `lga` : Geographic location
 - `ward` : Geographic location
 - `population` : Population around the well
 - `public_meeting` : True/False
 - `recorded_by` : Group entering this row of data
 - `scheme_management` : Who operates the waterpoint
 - `scheme_name` : Who operates the waterpoint
 - `permit` : If the waterpoint is permitted
 - `construction_year` : Year the waterpoint was constructed
 - `extraction_type` : The kind of extraction the waterpoint uses
 - `extraction_type_group` : The kind of extraction the waterpoint uses
 - `extraction_type_class` : The kind of extraction the waterpoint uses
 - `management` : How the waterpoint is managed
 - `management_group` : How the waterpoint is managed
 - `payment` : What the water costs
 - `payment_type` : What the water costs
 - `water_quality` : The quality of the water
 - `quality_group` : The quality of the water
 - `quantity` : The quantity of water
 - `quantity_group` : The quantity of water
 - `source` : The source of the water
 - `source_type` : The source of the water
 - `source_class` : The source of the water
 - `waterpoint_type` : The kind of waterpoint
 - `waterpoint_type_group` : The kind of waterpoint

 ### Labels

 There are three possible values:

 - `functional` : the waterpoint is operational and there are no repairs needed
 - `functional needs repair` : the waterpoint is operational, but needs repairs
 - `non functional` : the waterpoint is not operational

 ---

 ## Generate a submission

 Your code to generate a submission file may look like this:

 ```python
 # estimator is your model or pipeline, which you've fit on X_train

 # X_test is your pandas dataframe or numpy array,
 # with the same number of rows, in the same order, as test_features.csv,
 # and the same number of columns, in the same order, as X_train

 y_pred = estimator.predict(X_test)


 # Makes a dataframe with two columns, id and status_group,
 # and writes to a csv file, without the index

 sample_submission = pd.read_csv('sample_submission.csv')
 submission = sample_submission.copy()
 submission['status_group'] = y_pred
 submission.to_csv('your-submission-filename.csv', index=False)
 ```

 If you're working locally, the csv file is saved in the same directory as your notebook.

 If you're using Google Colab, you can use this code to download your submission csv file.

 ```python
 from google.colab import files
 files.download('your-submission-filename.csv')
 ```

 ---

In [12]:
import os, sys
in_colab = 'google.colab' in sys.modules

# If you're in Colab...
if in_colab:
    # Pull files from Github repo
    os.chdir('/content')
    get_ipython().system('git init .')
    get_ipython().system('git remote add origin https://github.com/LambdaSchool/DS-Unit-2-Regression-Classification.git')
    get_ipython().system('git pull origin master')
    
    # Install required python packages
    get_ipython().system('pip install -r requirements.txt')
    
    # Change into directory for module
    os.chdir('module4')



In [13]:
# Ignore this Numpy warning when using Plotly Express:
# FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning, module='numpy')



In [1]:
# ms-python.python added
import os
try:
	os.chdir(os.path.join(os.getcwd(), 'module4'))
	print(os.getcwd())
except:
	pass

/home/qweliantanner/MEGA/codin/lambda/DS-Unit-2-Regression-Classification/module4


In [2]:
from IPython import get_ipython


 Lambda School Data Science, Unit 2: Predictive Modeling

 # Regression & Classification, Module 4


 ## Assignment

 - [ ] Watch Aaron's [video #1](https://www.youtube.com/watch?v=pREaWFli-5I) (12 minutes) & [video #2](https://www.youtube.com/watch?v=bDQgVt4hFgY) (9 minutes) to learn about the mathematics of Logistic Regression.
 - [ ] [Sign up for a Kaggle account](https://www.kaggle.com/), if you don’t already have one. Go to our Kaggle InClass competition website. You will be given the URL in Slack. Go to the Rules page. Accept the rules of the competition.
 - [ ] Do train/validate/test split with the Tanzania Waterpumps data.
 - [ ] Begin with baselines for classification.
 - [ ] Use scikit-learn for logistic regression.
 - [ ] Get your validation accuracy score.
 - [ ] Submit your predictions to our Kaggle competition. (Go to our Kaggle InClass competition webpage. Use the blue **Submit Predictions** button to upload your CSV file. Or you can use the Kaggle API to submit your predictions.)
 - [ ] Commit your notebook to your fork of the GitHub repo.

 ---


 ## Stretch Goals

 - [ ] Add your own stretch goal(s) !
 - [ ] Clean the data. For ideas, refer to [The Quartz guide to bad data](https://github.com/Quartz/bad-data-guide),  a "reference to problems seen in real-world data along with suggestions on how to resolve them." One of the issues is ["Zeros replace missing values."](https://github.com/Quartz/bad-data-guide#zeros-replace-missing-values)
 - [ ] Make exploratory visualizations.
 - [ ] Do one-hot encoding. For example, you could try `quantity`, `basin`, `extraction_type_class`, and more. (But remember it may not work with high cardinality categoricals.)
 - [ ] Do [feature scaling](https://scikit-learn.org/stable/modules/preprocessing.html).
 - [ ] Get and plot your coefficients.
 - [ ] Try [scikit-learn pipelines](https://scikit-learn.org/stable/modules/compose.html).

 ---

 ## Data Dictionary

 ### Features

 Your goal is to predict the operating condition of a waterpoint for each record in the dataset. You are provided the following set of information about the waterpoints:

 - `amount_tsh` : Total static head (amount water available to waterpoint)
 - `date_recorded` : The date the row was entered
 - `funder` : Who funded the well
 - `gps_height` : Altitude of the well
 - `installer` : Organization that installed the well
 - `longitude` : GPS coordinate
 - `latitude` : GPS coordinate
 - `wpt_name` : Name of the waterpoint if there is one
 - `num_private` :
 - `basin` : Geographic water basin
 - `subvillage` : Geographic location
 - `region` : Geographic location
 - `region_code` : Geographic location (coded)
 - `district_code` : Geographic location (coded)
 - `lga` : Geographic location
 - `ward` : Geographic location
 - `population` : Population around the well
 - `public_meeting` : True/False
 - `recorded_by` : Group entering this row of data
 - `scheme_management` : Who operates the waterpoint
 - `scheme_name` : Who operates the waterpoint
 - `permit` : If the waterpoint is permitted
 - `construction_year` : Year the waterpoint was constructed
 - `extraction_type` : The kind of extraction the waterpoint uses
 - `extraction_type_group` : The kind of extraction the waterpoint uses
 - `extraction_type_class` : The kind of extraction the waterpoint uses
 - `management` : How the waterpoint is managed
 - `management_group` : How the waterpoint is managed
 - `payment` : What the water costs
 - `payment_type` : What the water costs
 - `water_quality` : The quality of the water
 - `quality_group` : The quality of the water
 - `quantity` : The quantity of water
 - `quantity_group` : The quantity of water
 - `source` : The source of the water
 - `source_type` : The source of the water
 - `source_class` : The source of the water
 - `waterpoint_type` : The kind of waterpoint
 - `waterpoint_type_group` : The kind of waterpoint

 ### Labels

 There are three possible values:

 - `functional` : the waterpoint is operational and there are no repairs needed
 - `functional needs repair` : the waterpoint is operational, but needs repairs
 - `non functional` : the waterpoint is not operational

 ---

 ## Generate a submission

 Your code to generate a submission file may look like this:

 ```python
 # estimator is your model or pipeline, which you've fit on X_train

 # X_test is your pandas dataframe or numpy array,
 # with the same number of rows, in the same order, as test_features.csv,
 # and the same number of columns, in the same order, as X_train

 y_pred = estimator.predict(X_test)


 # Makes a dataframe with two columns, id and status_group,
 # and writes to a csv file, without the index

 sample_submission = pd.read_csv('sample_submission.csv')
 submission = sample_submission.copy()
 submission['status_group'] = y_pred
 submission.to_csv('your-submission-filename.csv', index=False)
 ```

 If you're working locally, the csv file is saved in the same directory as your notebook.

 If you're using Google Colab, you can use this code to download your submission csv file.

 ```python
 from google.colab import files
 files.download('your-submission-filename.csv')
 ```

 ---

In [3]:
import os, sys
in_colab = 'google.colab' in sys.modules

# If you're in Colab...
if in_colab:
    # Pull files from Github repo
    os.chdir('/content')
    get_ipython().system('git init .')
    get_ipython().system('git remote add origin https://github.com/LambdaSchool/DS-Unit-2-Regression-Classification.git')
    get_ipython().system('git pull origin master')
    
    # Install required python packages
    get_ipython().system('pip install -r requirements.txt')
    
    # Change into directory for module
    os.chdir('module4')



In [4]:
# Ignore this Numpy warning when using Plotly Express:
# FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning, module='numpy')



In [5]:
# Read the Tanzania Waterpumps data
# train_features.csv : the training set features
# train_labels.csv : the training set labels
# test_features.csv : the test set features
# sample_submission.csv : a sample submission file in the correct format
    
import pandas as pd

train_features = pd.read_csv('../data/waterpumps/train_features.csv')
train_labels = pd.read_csv('../data/waterpumps/train_labels.csv')
test_features = pd.read_csv('../data/waterpumps/test_features.csv')
sample_submission = pd.read_csv('../data/waterpumps/sample_submission.csv')

assert train_features.shape == (59400, 40)
assert train_labels.shape == (59400, 2)
assert test_features.shape == (14358, 40)
assert sample_submission.shape == (14358, 2)



In [6]:
train_features.shape


(59400, 40)

In [7]:
train_labels.shape


(59400, 2)

In [8]:
test_features.shape

(14358, 40)

In [9]:
sample_submission.shape


(14358, 2)

In [10]:
train_features.head(45)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.1553,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
5,9944,20.0,2011-03-13,Mkinga Distric Coun,0,DWE,39.172796,-4.765587,Tajiri,0,...,per bucket,salty,salty,enough,enough,other,other,unknown,communal standpipe multiple,communal standpipe
6,19816,0.0,2012-10-01,Dwsp,0,DWSP,33.36241,-3.766365,Kwa Ngomho,0,...,never pay,soft,good,enough,enough,machine dbh,borehole,groundwater,hand pump,hand pump
7,54551,0.0,2012-10-09,Rwssp,0,DWE,32.620617,-4.226198,Tushirikiane,0,...,unknown,milky,milky,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump
8,53934,0.0,2012-11-03,Wateraid,0,Water Aid,32.7111,-5.146712,Kwa Ramadhan Musa,0,...,never pay,salty,salty,seasonal,seasonal,machine dbh,borehole,groundwater,hand pump,hand pump
9,46144,0.0,2011-08-03,Isingiro Ho,0,Artisan,30.626991,-1.257051,Kwapeto,0,...,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump


In [11]:
train_labels.head(45)


Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional
5,9944,functional
6,19816,non functional
7,54551,non functional
8,53934,non functional
9,46144,functional


In [12]:
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [13]:
train_features['id'].isin(train_labels['id']).value_counts()

True    59400
Name: id, dtype: int64

In [14]:
train = pd.merge(train_labels, train_features, how='right', on=['id'])

In [15]:
train.head()

Unnamed: 0,id,status_group,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,functional,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,functional,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,functional,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,non functional,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,functional,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [16]:
train.status_group.replace({'functional':0, 'functional needs repair':1, 'non functional':2}, inplace=True)

In [17]:
training_day, training_val = train_test_split(train, random_state=42)
training_day.shape, training_val.shape

((44550, 41), (14850, 41))

In [18]:
# BASELINE FOR CLASS STATUS_GROUP
target = "status_group"
y_train = training_day[target]
y_train.value_counts(normalize = True)

0    0.542334
2    0.384871
1    0.072795
Name: status_group, dtype: float64

In [19]:
y_train.mode()[0]
# the majority would thus be functional, though a worry would be that many are non functional

0

In [20]:
major_class = y_train.mode()[0]
y_pred1 = [major_class] * len(y_train)


In [21]:
sum(abs(y_pred1 - y_train))/ len(y_train)


0.8425364758698092

In [22]:
# baseline prdictions validate what we observe from normalized values
accuracy_score(y_train, y_pred1)


0.542334455667789

In [23]:
# lets test the actual validation set
y_val = training_val[target]
y_pred2 = [major_class] * len(y_val)
accuracy_score(y_val, y_pred2)

0.5453198653198653

In [24]:
# i should take a look at some of the features to see what i include in the feature, asking myself what
# can impact the functioning of water


train.head()

# i like subvillage, region, installer, population, source_class, waterpoint_type_group, quantity_group, gps_height, region, scheme_management, management_group, payment

Unnamed: 0,id,status_group,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,0,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,0,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,2,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [25]:
target = "status_group"
features =['subvillage', 'region', 'installer', 'population', 'source_class', 'waterpoint_type_group', 'quantity_group', 'gps_height', 'region', 'scheme_management', 'management_group', 'payment']


X_train = training_day[features]
y_train = training_day[target]
X_val = training_val[features]
y_val = training_val[target]


In [26]:

#%%
encoder = c


NameError: name 'c' is not defined

In [27]:
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train_encoded = encoder.fit_transform(X_train)
X_val_encoded = encoder.transform(X_val)


AttributeError: 'DataFrame' object has no attribute 'dtype'