In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action = 'ignore')

Problem statement:
A marketing firm wants to launch a promotional campaign in different regions of the country. To do so, the firm needs to understand the diversity of the population's demographics so that it can plan its campaigns accordingly.


In [None]:
# Location of the population CSV (Kaggle input directory layout).
DATA_PATH = '/kaggle/input/popuation-data-1/Population_Data.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the table exactly as loaded from the CSV.
data.head()

In [None]:
# Inspect column dtypes and non-null counts of the freshly loaded frame.
data.info()

Most columns are reported as `object` dtype, which is misleading: the values are numeric, but pandas reads them as strings because the numbers contain commas as digit-group separators.

In [None]:
# Columns that are stripped of commas and converted with pd.to_numeric below.
numeric = [
    'Indians',
    'Foreigners',
    'Indian_Male',
    'Indian_Female',
    'Foreigners_Male',
    'Foreigners_Female',
    'Total Population',
]


In [None]:
def cleaner(z):
    """Remove comma separators from a single cell value.

    Parameters
    ----------
    z : object
        A cell value, typically a string such as ``'1,234'``.

    Returns
    -------
    object
        ``z`` with every ``','`` removed when it is a string. Any other
        value (a missing value, or an already-numeric cell when the cleaning
        cell is re-run) is returned unchanged instead of raising
        ``AttributeError``.
    """
    if isinstance(z, str):
        return z.replace(',', '')
    return z

In [None]:
# Strip the comma separators from every column listed in `numeric`
# (cleaner returns strings, so no dtype changes yet), then preview the result.
data[numeric] = data[numeric].apply(lambda column: column.map(cleaner))

data.head()

In [None]:
# Re-check dtypes: cleaner() only strips commas and still returns strings,
# so these columns are not numeric yet; the conversion happens in the next cell.
data.info()

In [None]:
# Cast each cleaned column to a numeric dtype, then confirm with info().
for column in numeric:
    data[column] = pd.to_numeric(data[column])
data.info()

# Integrity check: Total Population = Indians + Foreigners

In [None]:
# Row-wise integrity check: Indians + Foreigners should equal Total Population
# in every record. Comparing only the grand totals can hide per-row errors
# that cancel each other out, so sum the absolute row-level gaps instead.
# A result of 0 means every row is consistent.
(data['Indians'] + data['Foreigners'] - data['Total Population']).abs().sum()

The data is consistent: the total population equals the sum of Indians and foreigners.

In [None]:
# Summed male/female counts minus the summed total population.
gender_columns = ['Indian_Male', 'Indian_Female', 'Foreigners_Male', 'Foreigners_Female']
data[gender_columns].sum().sum() - data['Total Population'].sum()

Clearly, Total Population > males + females, so there are people who do not identify as either male or female.

In [None]:
# Derive an 'other' column: total population minus the four male/female counts.
gender_columns = ['Indian_Male', 'Indian_Female', 'Foreigners_Male', 'Foreigners_Female']
# skipna=False keeps a missing count propagating to the row sum, exactly as
# chained `+` on the individual columns does.
mf_Sum = data[gender_columns].sum(axis=1, skipna=False)
data['other'] = data['Total Population'] - mf_Sum
data.head()

In [None]:
# Number of distinct values in the 'Region' and 'Office Location Id' columns.
tuple(data[name].nunique() for name in ('Region', 'Office Location Id'))

Neither of these columns contributes useful information, so we will leave them out of the model. Total Population will also be dropped.

In [None]:
# Build the modelling frame without 'Region', 'Office Location Id' and the total.
excluded = ['Region', 'Office Location Id', 'Total Population']
data1 = data.drop(excluded, axis=1)
data1.head()

In [None]:
from sklearn.preprocessing import Normalizer

# Normalizer rescales each row (sample) to unit norm. fit_transform returns a
# plain array, so wrap it back into a DataFrame to keep the column labels.
norm = Normalizer()
columns = data1.columns
data1 = pd.DataFrame(norm.fit_transform(data1), columns=columns)
data1.head()

# CLUSTERING

In [None]:
from sklearn.cluster import KMeans

# Fix the seed so centroid initialisation, and therefore the labels and
# inertia displayed in the following cells, are identical on every
# Restart & Run All.
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(data1)
pred = kmeans.predict(data1)

In [None]:
# Show the cluster label predicted for each row and how many labels there are.
pred, len(pred)

In [None]:
# Inertia of the fitted model: sum of squared distances of the samples to
# their closest cluster centre.
kmeans.inertia_

# Choosing the number of clusters from the inertia vs. number-of-clusters plot

In [None]:
# Fit KMeans for k = 1..9 and record the inertia of each fit.
# random_state is fixed so the resulting curve is reproducible across runs.
# NOTE: the loop variable `clusters` and the last fitted `kmeans` stay in the
# notebook namespace after this cell; keep the names stable because later
# cells may read them.
SSE = []
for clusters in range(1, 10):
    kmeans = KMeans(n_clusters=clusters, random_state=42)
    kmeans.fit(data1)
    SSE.append(kmeans.inertia_)

In [None]:
# Tabulate inertia against the number of clusters tried (k = 1..9).
frame = pd.DataFrame({'Cluster': range(1, 10), 'SSE': SSE})

# Line plot of inertia vs k, using the explicit Axes interface.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(frame['Cluster'], frame['SSE'], marker='o')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('INERTIA')


In [None]:
# Final model with k = 3 (presumably read off the inertia plot above).
# random_state is fixed so the cluster labels, and the plots built from them,
# are the same on every run.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(data1)
pred = kmeans.predict(data1)

In [None]:
# Attach each row's predicted label; seg() filters on this 'clusters' column.
data1['clusters'] = pred

In [None]:
def seg(str_x, str_y, cluster, df=None):
    """Split two columns into one Series per cluster label.

    Parameters
    ----------
    str_x, str_y : str
        Names of the columns to extract.
    cluster : int
        Number of clusters; labels ``0 .. cluster - 1`` are extracted.
    df : pandas.DataFrame, optional
        Frame holding the two columns and a ``'clusters'`` label column.
        Defaults to the notebook-level ``data1``.

    Returns
    -------
    tuple of (list, list)
        ``x[i]`` and ``y[i]`` hold the ``str_x`` and ``str_y`` values of the
        rows whose ``'clusters'`` label equals ``i``.
    """
    if df is None:
        df = data1

    x = []
    y = []

    # Iterate over the `cluster` argument rather than a similarly named
    # notebook global, so the result does not depend on hidden kernel state.
    for i in range(cluster):
        mask = df['clusters'] == i
        x.append(df[str_x][mask])
        y.append(df[str_y][mask])

    return x, y

def plot_clusters(str_x, str_y, cluster):
    """Scatter-plot ``str_x`` against ``str_y`` with one series per cluster.

    Parameters
    ----------
    str_x, str_y : str
        Column names used for the x and y axes (also used as axis labels).
    cluster : int
        Number of cluster labels to draw, ``0 .. cluster - 1``.
    """
    plt.figure(figsize=(5, 5), dpi=120)

    # One Series of x values and one of y values per cluster label.
    xs, ys = seg(str_x, str_y, cluster)

    for label in range(cluster):
        plt.scatter(xs[label], ys[label], label=f'cluster{label}')

    plt.xlabel(str_x)
    plt.ylabel(str_y)
    plt.title(" vs ".join((str_x, str_y)))
    plt.legend()

In [None]:
# Scatter of the 'Indians' vs 'Foreigners' columns, one colour per cluster (3 clusters).
plot_clusters('Indians', 'Foreigners', 3)

In [None]:
# Scatter of the 'Indian_Male' vs 'Foreigners_Male' columns, one colour per cluster (3 clusters).
plot_clusters('Indian_Male', 'Foreigners_Male', 3)
