# WRANGLE.PY

### Throughout the exercises for Regression in Python lessons, you will use the following example scenario: As a customer analyst, I want to know who has spent the most money with us over their lifetime. I have monthly charges and tenure, so I think I will be able to use those two attributes as features to estimate total_charges. I need to do this within an average of $5.00 per customer.
- The first step will be to acquire and prep the data. Do your work for this exercise in a file named wrangle.py.

### Imports

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd 
import numpy as np 
import os
from env import host, user, password 
import wrangle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


#### 1. Acquire customer_id, monthly_charges, tenure, and total_charges from telco_churn database for all customers with a 2 year contract.

##### Acquire the Data

In [None]:
#connection to codeup database
def get_connection(db, user=user, host=host, password=password):
    '''
    Build the mysql+pymysql connection url for the database named db.

    db: name of the database to connect to
    user, host, password: credentials, defaulting to the values imported from env
    return: the connection url as a string
    '''
    credentials = f'{user}:{password}'
    location = f'{host}/{db}'
    return f'mysql+pymysql://{credentials}@{location}'

In [None]:
#sql query used to get 2 year contract data
#selects the four requested columns plus contract_type_id and keeps only rows
#whose contract_type_id is '3'
#NOTE: acquire_telco() defines its own local copy of this query, so this
#module-level variable is not the one that function runs
sql_query = '''
            SELECT customer_id, monthly_charges, tenure, total_charges, contract_type_id
            FROM customers
            JOIN contract_types USING(`contract_type_id`)
            WHERE contract_type_id = '3';
            '''

In [None]:
#function to acquire dataframe
def acquire_telco():
    '''
    Return the 2 year contract telco customers as a pandas dataframe.

    When the local file telco_churn.csv exists it is read and returned.
    Otherwise the query is run against the telco_churn database and the
    result is written to telco_churn.csv before being returned, so later
    calls can use the file instead of the database.
    '''
    cache_file = 'telco_churn.csv'

    # cached copy available: the first csv column holds the saved index
    if os.path.isfile(cache_file):
        return pd.read_csv(cache_file, index_col=0)

    sql_query = '''
                        SELECT customer_id, monthly_charges, tenure, total_charges, contract_type_id
                        FROM customers
                        JOIN contract_types USING(`contract_type_id`)
                        WHERE contract_type_id = '3';
                    '''
    telco = pd.read_sql(sql_query, get_connection('telco_churn'))
    telco.to_csv(cache_file)
    return telco


In [None]:
df = acquire_telco()

In [None]:
df.head()

#### 2. Walk through the steps above using your new dataframe. You may handle the missing values however you feel is appropriate.

##### Sample and Summarize

In [None]:
#shape
df.shape

In [None]:
# info 
df.info()

<div class="alert alert-block alert-success">
<b>Takeaways:</b>
<br>- Total charges is stored as an object (string) type when it should be numeric (int or float), so the column may contain nulls or other non-numeric entries.</div>

In [None]:
#distribution of numerical variables
df.describe()

In [None]:
#lets look at total charges to see what might be causing the variable to 
#be labeled as an object
df.total_charges.value_counts()

In [None]:
#hard to see with so many values
#let's check for nulls
df.isna().sum()

<div class="alert alert-block alert-success">
<b>Takeaways:</b>
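<br>- No nulls observed in the current dataset; the nulls that were present in total charges were from MTM and 1-year contracts. Note that <code>isna()</code> does not count whitespace-only strings, so those are checked in the next step.</div>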

In [None]:
#10 white-space entries were found in total_charges
#the regex ^\s*$ matches strings that are empty or contain only whitespace;
#replace those with NaN so they are reported as missing values
df.total_charges = df.total_charges.replace(r'^\s*$', np.nan, regex=True)

In [None]:
missing = df.isnull().sum()
missing[missing > 0]

In [None]:
df[df.total_charges.isnull()]

In [None]:
#look at the customers whose tenure is 1, to compare with the rows that are
#missing total_charges
#NOTE(review): this filter selects tenure == 1, not tenure 0 -- confirm that
#is the intended comparison
df[df.tenure == 1]

In [None]:
#certainty for imputing
df[df.tenure == 2]
#roughly double

In [None]:
df = df.fillna(0)

In [None]:
df.info()

In [None]:
#Let's change total charges to numeric
#errors='coerce' turns any value that cannot be parsed as a number into NaN,
#so nulls can reappear here even after the earlier fillna -- re-check afterwards
df['total_charges'] = pd.to_numeric(df.total_charges, errors='coerce')
#or this way (raises an error on unparseable values instead of coercing them)
#df['total_charges'] = df['total_charges'].astype('float')

In [None]:
#Check to see that it is now numeric
df.info()

In [None]:
#drop the columns we will not be utilizing


In [None]:
df = df.drop(columns=['contract_type_id'])

In [None]:
df.head()

##### Visualize Distribution

In [None]:
# one histogram per numeric column, side by side in a single row
hist_columns = ['monthly_charges', 'tenure', 'total_charges']
plt.figure(figsize=(16, 3))

# size the grid from the column list so no subplot slot is left empty;
# subplot numbers start at 1, hence enumerate(start=1)
for plot_number, col in enumerate(hist_columns, start=1):
    plt.subplot(1, len(hist_columns), plot_number)
    plt.title(col)
    df[col].hist(bins=5)

In [None]:
#boxplot
sns.boxplot(data=df.drop(columns=['total_charges']))

In [None]:
sns.boxplot(data=df[['total_charges']])

##### Train, Test, Split

In [None]:
#hold out 20% of the rows as the test set; the other 80% (train_val) is split again below
#random_state fixes the shuffle so the split is repeatable
train_val, test = train_test_split(df, train_size=0.8, random_state=123)


In [None]:
#70% of train_val becomes train and the remaining 30% becomes validate
train, validate = train_test_split(train_val, train_size=0.7, random_state=123)


In [None]:
#observe size of train
train.info()


#### 3. End with a python file wrangle.py that contains the function, wrangle_telco(), that will acquire the data and return a dataframe cleaned with no missing values.

##### Pipeline Function

In [None]:
#all reproducible functions used
def get_connection(db, user=user, host=host, password=password):
    '''
    Build the mysql+pymysql connection url for the database named db.

    db: name of the database to connect to
    user, host, password: credentials, defaulting to the values imported from env
    return: the connection url as a string
    '''
    login = f'{user}:{password}'
    target = f'{host}/{db}'
    return f'mysql+pymysql://{login}@{target}'
    
#function to acquire dataframe
def acquire_telco():
    '''
    Return the 2 year contract telco customers as a pandas dataframe.

    Uses the local file telco_churn.csv when it is present. If it is not,
    the query is sent to the telco_churn database and the result is saved
    to telco_churn.csv before it is returned.
    '''
    csv_name = 'telco_churn.csv'

    # a saved copy exists: load it, using the first csv column as the index
    if os.path.isfile(csv_name):
        return pd.read_csv(csv_name, index_col=0)

    sql_query = '''
                        SELECT customer_id, monthly_charges, tenure, total_charges, contract_type_id
                        FROM customers
                        JOIN contract_types USING(`contract_type_id`)
                        WHERE contract_type_id = '3';
                    '''
    customers = pd.read_sql(sql_query, get_connection('telco_churn'))
    customers.to_csv(csv_name)
    return customers

def clean_telco(df):
    '''
    Takes in a df of telco customer monthly charges, tenure, total charges
    and cleans the data by:
    - treating empty / whitespace-only total_charges entries as missing
    - converting total_charges from object data to float
    - filling every remaining missing value with 0
    - dropping the contract_type_id column (customer_id is kept)

    The dataframe passed in is not modified; the work is done on a copy.

    df: pandas dataframe with at least total_charges and contract_type_id columns
    return: df, a cleaned pandas dataframe
    '''
    # copy first so the assignments below do not change the caller's dataframe
    df = df.copy()
    # empty or whitespace-only strings become NaN so they count as missing
    blanks_as_nan = df['total_charges'].replace(r'^\s*$', np.nan, regex=True)
    # convert before filling so the column is already float when the 0s go in
    df['total_charges'] = blanks_as_nan.astype('float')
    df = df.fillna(0)
    return df.drop(columns=['contract_type_id'])



def split_data(df, train_val_size=0.8, train_size=0.7, random_state=123):
    '''
    split our data,
    takes in a pandas dataframe and splits it twice with train_test_split

    df: pandas dataframe to split
    train_val_size: share of df kept for train + validate, the rest is test
    train_size: share of the train + validate portion that goes to train
    random_state: seed passed to both splits so the result is repeatable
    returns: three pandas dataframes in the order train, validate, test
    '''
    # first split: hold out the test set
    train_val, test = train_test_split(
        df, train_size=train_val_size, random_state=random_state
    )
    # second split: divide what is left into train and validate
    train, validate = train_test_split(
        train_val, train_size=train_size, random_state=random_state
    )
    return train, validate, test



In [None]:
def wrangle_telco():
    '''
    Run the whole telco pipeline in one call:
    acquire the data with acquire_telco,
    clean it with clean_telco,
    split it with split_data.

    return: train, validate, test sets of pandas dataframes from telco data
    '''
    raw = acquire_telco()
    cleaned = clean_telco(raw)
    return split_data(cleaned)

In [4]:
df2 = wrangle.clean_telco(wrangle.acquire_telco())

In [5]:
df2.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0013-SMEOE,109.7,71,7904.25
1,0014-BMAQU,84.65,63,5377.8
2,0016-QLJIS,90.45,65,5957.9
3,0017-DINOC,45.2,54,2460.55
4,0017-IUDMW,116.8,72,8456.75


In [6]:
train1, val1, test1 = wrangle.wrangle_telco()

In [7]:
train1.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
1256,7501-IWUNG,73.8,61,4616.05
225,1303-SRDOK,69.05,55,3842.6
662,3967-VQOGC,24.9,67,1680.25
628,3777-XROBG,19.55,58,1079.65
824,5075-JSDKI,24.45,59,1493.1
