### 1. Initial Data Exploration (SQL)

In [None]:
import getpass
import urllib.parse

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as db

: 

In [None]:
# The raw CSV has no header row, so it is loaded with header=None;
# the real column names are attached in a later step.

data = pd.read_csv(filepath_or_buffer='creditcardmarketing.csv', header=None)
data.head()

: 

In [None]:
# Names for the 17 columns, assigned positionally to the headerless frame.
columns = [
    'customer_number',
    'offer_accepted',
    'reward',
    'mailer_type',
    'income_level',
    'bank_accounts_open',
    'overdraft_protection',
    'credit_rating',
    'credit_cards_held',
    'homes_owned',
    'household_size',
    'home_owner',
    'average_balance',
    'balance_Q1',
    'balance_Q2',
    'balance_Q3',
    'balance_Q4',
]

data.columns = columns

: 

In [None]:
# Save a copy of the data that carries the column names; index=False keeps
# the DataFrame index out of the file.
data.to_csv(path_or_buf='creditcardmarketing_with_headers.csv', index=False)

: 

### Instructions

#### 1. Create a database called `credit_card_classification`

In [None]:
password = getpass.getpass('Get password')

# Percent-encode the password before embedding it in the URL: characters such
# as '@', '/' or ':' would otherwise be read as URL delimiters and the
# connection string would be parsed incorrectly.
connection_string = (
    'mysql+pymysql://root:'
    + urllib.parse.quote(password, safe='')
    + '@localhost/credit_card_classification'
)

# The echo=True argument allows us to see the queries sent by the engine
engine = db.create_engine(connection_string, echo=True)

# The credit_card_classification schema was created in MySQL Workbench, so the next two lines are commented out
# engine.execute("CREATE DATABASE IF NOT EXISTS credit_card_classification")
# engine.execute("USE credit_card_classification")

# `metadata` collects table definitions; `conn` is used by the cells that
# execute statements directly on a connection.
metadata = db.MetaData()
conn = engine.connect()

: 

#### 2. Create a table `credit_card_data` with the same columns as given in the csv file. Please make sure you use the correct data types for each of the columns.

In [None]:
# Table definition mirroring the CSV layout: text columns are VARCHAR(255)
# declared NOT NULL, counts are integers, balances are floats.
credit_card_data = db.Table(
    'credit_card_data',
    metadata,
    db.Column('customer_number', db.Integer()),
    # Campaign attributes
    db.Column('offer_accepted', db.String(255), nullable=False),
    db.Column('reward', db.String(255), nullable=False),
    db.Column('mailer_type', db.String(255), nullable=False),
    # Customer profile
    db.Column('income_level', db.String(255), nullable=False),
    db.Column('bank_accounts_open', db.Integer()),
    db.Column('overdraft_protection', db.String(255), nullable=False),
    db.Column('credit_rating', db.String(255), nullable=False),
    db.Column('credit_cards_held', db.Integer()),
    db.Column('homes_owned', db.Integer()),
    db.Column('household_size', db.Integer()),
    db.Column('home_owner', db.String(255), nullable=False),
    # Balances: overall average followed by the four quarters
    db.Column('average_balance', db.Float()),
    db.Column('balance_Q1', db.Float()),
    db.Column('balance_Q2', db.Float()),
    db.Column('balance_Q3', db.Float()),
    db.Column('balance_Q4', db.Float()),
)

# Issue CREATE TABLE for the tables registered on `metadata`.
metadata.create_all(bind=engine)

: 

In [None]:
# Query the table and list its column names to confirm the schema.
# Note that this rebinds `data` to the query result.
data = pd.read_sql_query(sql="SELECT * FROM credit_card_data", con=engine)
data.columns

: 

#### 3. Import the data from the `.csv` file into the table. Before you import the data into the empty table, make sure that you have deleted the headers from the `.csv` file. To not modify the original data, if you want you can create a copy of the `.csv` file as well. Note you might have to use the following queries to give permission to SQL to import data from `.csv` files in bulk:

```sql
SHOW VARIABLES LIKE 'local_infile'; -- This query would show you the status of the variable ‘local_infile’. If it is off, use the next command, otherwise you should be good to go

SET GLOBAL local_infile = 1;
```

In [None]:
# This task was done in MySQL Workbench using the GUI import table option.

: 

#### 4.  Select all the data from table `credit_card_data` to check if the data was imported correctly.

In [None]:
# Pull the whole table into a DataFrame and preview the first rows.
query = "SELECT * FROM credit_card_data"
data = pd.read_sql_query(sql=query, con=engine)
data.head()

: 

#### 5.  Use the _alter table_ command to drop the column `q4_balance` from the database, as we would not use it in the analysis with SQL. Select all the data from the table to verify if the command worked. Limit your returned results to 10.

In [None]:
# Drop the Q4 balance column from the table.
# NOTE: presumably this fails on a second run because the column is already
# gone; verify before re-running.
query = "ALTER TABLE credit_card_data DROP COLUMN balance_Q4"

# Wrap the statement in text(): passing a plain string to Connection.execute()
# is deprecated in SQLAlchemy 1.4 and rejected by 2.0.
conn.execute(db.text(query))

: 

In [None]:
# Fetch the first 10 rows to verify that the column was dropped.
query = "SELECT * FROM credit_card_data LIMIT 10"
top_ten = pd.read_sql_query(query, engine)

# Show the frame itself: .head() defaults to 5 rows and would hide half of
# the 10 rows requested by LIMIT.
top_ten

: 

#### 6.  Use sql query to find how many rows of data you have.

In [None]:
# Count the rows in the table.
query = (
    "SELECT COUNT(*) AS number_of_rows \n"
    "           FROM credit_card_data"
)

rows = pd.read_sql_query(sql=query, con=engine)
print(rows)

: 

#### 7.  Now we will try to find the unique values in some of the categorical columns:

    - What are the unique values in the column `Offer_accepted`?

In [None]:
# Distinct values of offer_accepted.
query = (
    "SELECT DISTINCT offer_accepted \n"
    "           FROM credit_card_data"
)

unique_values = pd.read_sql_query(sql=query, con=engine)
print(unique_values)

: 

    - What are the unique values in the column `Reward`?

In [None]:
# Distinct values of reward.
query = (
    "SELECT DISTINCT reward \n"
    "           FROM credit_card_data"
)

unique_values = pd.read_sql_query(sql=query, con=engine)
print(unique_values)

: 

    - What are the unique values in the column `mailer_type`?

In [None]:
# Distinct values of mailer_type.
query = (
    "SELECT DISTINCT mailer_type \n"
    "           FROM credit_card_data"
)

unique_values = pd.read_sql_query(sql=query, con=engine)
print(unique_values)

: 

    - What are the unique values in the column `credit_cards_held`?

In [None]:
# Distinct values of credit_cards_held.
query = (
    "SELECT DISTINCT credit_cards_held \n"
    "           FROM credit_card_data"
)

unique_values = pd.read_sql_query(sql=query, con=engine)
print(unique_values)

: 

    - What are the unique values in the column `household_size`?

In [None]:
# Distinct values of household_size, sorted ascending.
query = (
    "SELECT DISTINCT household_size \n"
    "           FROM credit_card_data\n"
    "           ORDER BY household_size ASC"
)

unique_values = pd.read_sql_query(sql=query, con=engine)
print(unique_values)

: 

#### 8.  Arrange the data in a decreasing order by the `average_balance` of the house. Return only the `customer_number` of the top 10 customers with the highest `average_balances` in your data.

In [None]:
# customer_number of the 10 customers with the highest average_balance.
query = (
    "SELECT customer_number\n"
    "FROM credit_card_data\n"
    "ORDER BY average_balance DESC\n"
    "LIMIT 10"
)

customers = pd.read_sql_query(sql=query, con=engine)
print(customers)

: 

#### 9.  What is the average balance of all the customers in your data?

In [None]:
# Mean of average_balance over all customers, rounded to one decimal.
query = (
    "SELECT ROUND(AVG(average_balance), 1) AS average_balance\n"
    "FROM credit_card_data"
)

avg_balance = pd.read_sql_query(sql=query, con=engine)
print(avg_balance)

: 

#### 10. In this exercise we will use simple group by to check the properties of some of the categorical variables in our data. Note wherever `average_balance` is asked, please take the average of the column `average_balance`: 

    - What is the average balance of the customers grouped by `Income Level`? The returned result should have only two columns, income level and `Average balance` of the customers. Use an alias to change the name of the second column.

In [None]:
# Mean average_balance per income level, rounded to one decimal.
query = (
    "SELECT income_level, ROUND(AVG(average_balance), 1) AS avg_balance\n"
    "FROM credit_card_data\n"
    "GROUP BY income_level"
)

avg_balance_per_income = pd.read_sql_query(sql=query, con=engine)
print(avg_balance_per_income)

: 

    - What is the average balance of the customers grouped by `number_of_bank_accounts_open`? The returned result should have only two columns, `number_of_bank_accounts_open` and `Average balance` of the customers. Use an alias to change the name of the second column.

In [None]:
# Mean average_balance per number of open bank accounts, rounded to one decimal.
query = (
    "SELECT bank_accounts_open, ROUND(AVG(average_balance), 1) AS avg_balance\n"
    "FROM credit_card_data\n"
    "GROUP BY bank_accounts_open"
)

avg_balance_per_bank_accounts = pd.read_sql_query(sql=query, con=engine)
print(avg_balance_per_bank_accounts)

: 

    - What is the average number of credit cards held by customers for each of the credit card ratings? The returned result should have only two columns, rating and average number of credit cards held. Use an alias to change the name of the second column.

In [None]:
# Mean number of credit cards held per credit rating, rounded to one decimal.
query = (
    "SELECT credit_rating, ROUND(AVG(credit_cards_held), 1) AS avg_number_of_cards\n"
    "FROM credit_card_data\n"
    "GROUP BY credit_rating"
)

avg_balance_per_credit_rating = pd.read_sql_query(sql=query, con=engine)
print(avg_balance_per_credit_rating)

: 

    - Is there any correlation between the columns `credit_cards_held` and `number_of_bank_accounts_open`? You can analyse this by grouping the data by one of the variables and then aggregating the results of the other column. Visually check if there is a positive correlation or negative correlation or no correlation between the variables.

In [None]:
# Mean number of credit cards held per number of open bank accounts; the
# printed table is inspected by eye for a trend between the two variables.
query = (
    "SELECT bank_accounts_open, ROUND(AVG(credit_cards_held), 1) AS avg_number_of_cards\n"
    "FROM credit_card_data\n"
    "GROUP BY bank_accounts_open"
)

cc_held_per_bank_accounts_open = pd.read_sql_query(sql=query, con=engine)
print(cc_held_per_bank_accounts_open)

: 

#### 11. Your managers are only interested in the customers with the following properties:

    - Credit rating medium or high
    - Credit cards held 2 or less
    - Owns their own home
    - Household size 3 or more

For the rest of the things, they are not too concerned. Write a simple query to find what are the options available for them? Can you filter the customers who accepted the offers here?

In [None]:
# Customers with a medium or high credit rating, at most two credit cards,
# who own their home and have a household of three or more.
query = (
    "SELECT *\n"
    "FROM credit_card_data\n"
    "WHERE (credit_rating = 'Medium' OR credit_rating = 'High') AND\n"
    "credit_cards_held <= 2 AND\n"
    "home_owner = 'Yes' AND \n"
    "household_size >= 3\n"
)

relevant_data = pd.read_sql_query(sql=query, con=engine)
relevant_data.head()

: 

#### 12. Your managers want to find out the list of customers whose average balance is less than the average balance of all the customers in the database. Write a query to show them the list of such customers. You might need to use a subquery for this problem.

In [None]:
# Customers whose average_balance is below the table-wide mean; the mean is
# computed by the scalar subquery.
query = (
    "SELECT customer_number, average_balance\n"
    "FROM credit_card_data\n"
    "WHERE average_balance < (SELECT AVG(average_balance) FROM credit_card_data)"
)

lower_than_avg = pd.read_sql_query(sql=query, con=engine)

# Sanity check: the largest balance returned should be below the overall average
max(lower_than_avg['average_balance'])

: 

#### 13. Since this is something that the senior management is regularly interested in, create a view of the same query.

In [None]:
# View over the customers whose average_balance is below the table-wide mean.
# CREATE OR REPLACE keeps the cell re-runnable: a plain CREATE VIEW fails when
# the view already exists.
query = '''CREATE OR REPLACE VIEW low_balance_customers AS
SELECT customer_number, average_balance
FROM credit_card_data
WHERE average_balance < (SELECT AVG(average_balance) FROM credit_card_data)'''

# Execute the statement so the view is actually created in the database;
# text() is required for string SQL on SQLAlchemy 2.0 and works on 1.x too.
conn.execute(db.text(query))

: 

#### 14. What is the number of people who accepted the offer vs number of people who did not?

In [None]:
# Tally the offer_accepted labels once, then read the two counts by label.
# Uses whatever frame `data` currently holds from the earlier cells.
offer_counts = data.offer_accepted.value_counts()
rejected = offer_counts.loc['No']
accepted = offer_counts.loc['Yes']

print(rejected, "clients rejected the offer")
print(accepted, "clients accepted the offer")

: 

#### 15. Your managers are more interested in customers with a credit rating of high or medium. What is the difference in average balances of the customers with high credit card rating and low credit card rating?

In [None]:
# Mean average_balance for the 'High' and 'Low' credit ratings only,
# rounded to one decimal.
query = (
    "SELECT credit_rating, ROUND(AVG(average_balance), 1) AS avg_balance\n"
    "FROM credit_card_data\n"
    "WHERE credit_rating = 'High' OR credit_rating = 'Low' \n"
    "GROUP BY credit_rating"
)

avg_balance_per_rating = pd.read_sql_query(sql=query, con=engine)
print(avg_balance_per_rating)

: 

In [None]:
# Look each rating up by label rather than by row position: the GROUP BY
# query has no ORDER BY, so the order of the 'High' and 'Low' rows is not
# guaranteed and a positional subtraction could silently flip the sign.
balance_by_rating = avg_balance_per_rating.set_index('credit_rating')['avg_balance']

# Difference is defined as High minus Low.
difference = balance_by_rating.loc['High'] - balance_by_rating.loc['Low']

print(round(difference, 1))

: 

#### 16. Which types of communication (`mailer_type`) were used in the database, and with how many customers each?

In [None]:
# Number of customers per mailer type, computed on the frame currently held
# in `data`.
print(data['mailer_type'].value_counts())

: 

#### 17. Provide the details of the customer with the 11th lowest `Q1_balance` in your database.

In [None]:
# Rank rows by balance_Q1 ascending with DENSE_RANK (ties share a rank and no
# ranks are skipped), then keep the rows ranked 11th.
query = (
    "SELECT * \n"
    "FROM\n"
    "(SELECT *,\n"
    "DENSE_RANK() OVER(ORDER BY balance_Q1 ASC) AS Q1_balance_ranking\n"
    "FROM credit_card_data) AS ranked_data\n"
    "WHERE Q1_balance_ranking = 11;\n"
)

customer_info = pd.read_sql_query(sql=query, con=engine)
# Several customers share the 11th lowest Q1 balance, so more than one row is printed
print(customer_info)

: 