In [5]:
import pandas as pd
import numpy as np

In [2]:
customer_df = pd.read_csv('/home/mendis/Documents/Tutorials/coustomer segmentation/data.csv')

suggestions_cleaning = []

## Data exploration

### Basic data exploration

In [4]:
customer_df.shape

(541909, 8)

In [5]:
customer_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


From the Date column we can derive the date, time and weekday with some calcualtion. If Stock code points to a unique order type, we can remove the Description column. Noticed that the invoiceno is repeated. So another data frames can be created for unique invoice numbers and uniqe customer ID.


In [6]:
customer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [7]:
customer_df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


In [8]:
customer_df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [9]:
customer_df.isnull().sum()['CustomerID']*100/len(customer_df)

24.926694334288598

24% of customer IDs are null. Further investigation is needed.

In [10]:
print('Unique values of each columns\n')

dataframe = customer_df

for column in dataframe.columns:
    unique = len(customer_df[column].unique())
    print(str(column) + ' : '+ str(unique))

Unique values of each columns

InvoiceNo : 25900
StockCode : 4070
Description : 4224
Quantity : 722
InvoiceDate : 23260
UnitPrice : 1630
CustomerID : 4373
Country : 38


### Investigation of negative Quantities

Can quantity or Unit price be a negative value? Or this represents a return of items? or some error. And notice that min order and max order has the same value.

In [11]:
customer_df.loc[customer_df['Quantity']<0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
141,C536379,D,Discount,-1,12/1/2010 9:41,27.50,14527.0,United Kingdom
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,12/1/2010 9:49,4.65,15311.0,United Kingdom
235,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,12/1/2010 10:24,1.65,17548.0,United Kingdom
236,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,12/1/2010 10:24,0.29,17548.0,United Kingdom
237,C536391,21983,PACK OF 12 BLUE PAISLEY TISSUES,-24,12/1/2010 10:24,0.29,17548.0,United Kingdom
...,...,...,...,...,...,...,...,...
540449,C581490,23144,ZINC T-LIGHT HOLDER STARS SMALL,-11,12/9/2011 9:57,0.83,14397.0,United Kingdom
541541,C581499,M,Manual,-1,12/9/2011 10:28,224.69,15498.0,United Kingdom
541715,C581568,21258,VICTORIAN SEWING BOX LARGE,-5,12/9/2011 11:57,10.95,15311.0,United Kingdom
541716,C581569,84978,HANGING HEART JAR T-LIGHT HOLDER,-1,12/9/2011 11:58,1.25,17315.0,United Kingdom


In [12]:
# it seems like there are 10624 records of negative values. Let's investigate. The primary hypothesis is these are 
# returns of goods made for pervious purchases and Discounts. 

In [13]:
len(customer_df.loc[customer_df['Quantity']<0])

10624

In [14]:
# total value as a percentage
temp_df = customer_df
temp_df['Value'] = customer_df['Quantity']*customer_df['UnitPrice']

revanue = sum(temp_df['Value'])

negative_df = customer_df.loc[customer_df['Quantity']<0]

total_negative = sum(negative_df['Value'])

print(total_negative*100/revanue)

-9.200201893522904


In [60]:
index_negative = customer_df.loc[customer_df['Quantity']<0].index.tolist()

count = []
for index in index_negative:
    customer = customer_df.iloc[index]
    customer_id = customer['CustomerID']
    description = customer['Description']
    previous_purchases_description = customer_df.loc[(customer_df['CustomerID']== customer_id) & 
                                                     (customer_df.index < index)]['Description'].tolist()
    if description in previous_purchases_description:
        count = np.append(count, 1)
    else:
        count = np.append(count, 0)
print(sum(count))
print(len(count))
print(sum(count)*100/len(count))

7705.0
10624
72.52447289156626


72% of negative values represents returns and this percentage increases when we did not take values in the begining of the year.

In [91]:
suggestions_cleaning = np.append(suggestions_cleaning,
          'After removing special labels, check negative value representation and sum them accordint to the custoemr')
suggestions_cleaning

array(['After removing special labels, check negative value representation and sum them accordint to the custoemr'],
      dtype='<U105')

### Investigating special labels in Stock Codes such as M and D

In [4]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)


stockcd = customer_df['StockCode'].unique()

special_labels = []

for i in range(0,len(stockcd)):
    if hasNumbers(stockcd[i])==False:
        special_labels = np.append(special_labels, stockcd[i])

special_labels


array(['POST', 'D', 'DOT', 'M', 'BANK CHARGES', 'S', 'AMAZONFEE', 'm',
       'DCGSSBOY', 'DCGSSGIRL', 'PADS', 'B', 'CRUK'], dtype='<U32')

In [146]:
# let's check description of for special labels, frequency of occurance and total value.
temp_df = customer_df
temp_df['Value'] = customer_df['Quantity']*customer_df['UnitPrice']

revanue = sum(temp_df['Value'])

for label in special_labels:
    description = customer_df.loc[customer_df['StockCode']==label]['Description'].unique()
    frequecny = len(customer_df.loc[customer_df['StockCode']==label]['Description'])
    percentage = sum(temp_df.loc[temp_df['StockCode']==label]['Value'])*100/revanue
    print('label and description : '+str(label)+ str(description))
    print('frequency : '+str(frequecny) + ' percentage : '+str(percentage))


label and description : POST['POSTAGE' nan]
frequency : 1256 percentage : 0.6794455544851231
label and description : D['Discount']
frequency : 77 percentage : -0.05843626690560821
label and description : DOT['DOTCOM POSTAGE' nan]
frequency : 710 percentage : 2.1158269725107677
label and description : M['Manual']
frequency : 571 percentage : -0.7045133959654735
label and description : BANK CHARGES['Bank Charges']
frequency : 37 percentage : -0.07361330071912449
label and description : S['SAMPLES']
frequency : 63 percentage : -0.03128302065919023
label and description : AMAZONFEE['AMAZON FEE']
frequency : 34 percentage : -2.2725300397568526
label and description : m['Manual']
frequency : 1 percentage : 2.6159888594418903e-05
label and description : DCGSSBOY['BOYS PARTY BAG']
frequency : 11 percentage : 0.0015444593050548106
label and description : DCGSSGIRL['GIRLS PARTY BAG']
frequency : 13 percentage : 0.0014816755724282047
label and description : PADS['PADS TO MATCH ALL CUSHIONS']
freq

All of these labels are recommended for removal

### Investigating the stock code 'M'

In [34]:
# nubmer of records having the stock code 'M

len(customer_df.loc[customer_df['StockCode']=='M'])

571

In [35]:
# nubmer of unique customer ID's assigned for M

len(customer_df.loc[customer_df['StockCode']=='M']['CustomerID'].unique())


277

In [38]:
# Let us investigate a few records containing the code

customer_df.loc[customer_df['StockCode']=='M'].head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
2239,536569,M,Manual,1,12/1/2010 15:35,1.25,16274.0,United Kingdom
2250,536569,M,Manual,1,12/1/2010 15:35,18.95,16274.0,United Kingdom
5684,536865,M,Manual,1,12/3/2010 11:28,2.55,,United Kingdom
6798,536981,M,Manual,2,12/3/2010 14:26,0.85,14723.0,United Kingdom
7976,537077,M,Manual,12,12/5/2010 11:59,0.42,17062.0,United Kingdom


In [46]:
# so it's clear that 'M' doesn't always occure with a negative quantity. The primary hypothesis is these indicates
# customers who are not in the system. Let's see if a few customer ID's have returned

CustID = customer_df.loc[customer_df['StockCode']=='M']['CustomerID'].unique()

for customer in CustID:
    print(customer)
    print(customer_df.loc[customer_df['CustomerID']==customer].index)

16274.0
Int64Index([2185, 2186, 2187, 2188, 2189, 2190, 2191, 2192, 2193, 2194, 2195,
            2196, 2197, 2198, 2199, 2200, 2201, 2202, 2203, 2204, 2205, 2206,
            2207, 2208, 2209, 2210, 2211, 2212, 2213, 2214, 2215, 2216, 2217,
            2218, 2219, 2220, 2221, 2222, 2223, 2224, 2225, 2226, 2227, 2228,
            2229, 2230, 2231, 2232, 2233, 2234, 2235, 2236, 2237, 2238, 2239,
            2240, 2241, 2242, 2243, 2244, 2245, 2246, 2247, 2248, 2249, 2250,
            2251],
           dtype='int64')
nan
Int64Index([], dtype='int64')
14723.0
Int64Index([  6748,   6749,   6750,   6751,   6752,   6753,   6754,   6755,
              6756,   6757,
            ...
            511338, 511339, 511340, 511341, 511342, 511343, 511344, 511345,
            511346, 511347],
           dtype='int64', length=278)
17062.0
Int64Index([ 7960,  7961,  7962,  7963,  7964,  7965,  7966,  7967,  7968,
             7969,  7970,  7971,  7972,  7973,  7974,  7975,  7976,  7977,
             797

Int64Index([109300, 109301, 109302, 109303, 109304, 109305, 109306, 109307,
            109308, 109309, 109310, 109311, 109312, 109313, 109314, 109315,
            109316, 109317, 109318, 109319, 109335, 109336, 109337, 109338,
            109339, 109340, 109341, 109342, 109343],
           dtype='int64')
14543.0
Int64Index([ 23752,  23753,  23754,  23755,  23756,  70382,  70383,  70384,
             70385,  70386,
            ...
            452091, 452092, 452093, 452094, 452095, 459222, 529974, 529975,
            529976, 529977],
           dtype='int64', length=107)
12754.0
Int64Index([ 81821,  81822,  81823,  81824,  81825,  81826,  81827,  81828,
             81829,  81830,  81831,  81832,  81833,  81834,  81835,  81836,
             81837,  81838,  81839,  81840,  81841,  81842,  81843,  81844,
             81845,  81846,  81847,  81848,  81849,  81850,  81851,  81852,
             81853,  81854,  81855,  81856,  81857, 111519, 111928, 111929,
            111930, 111931, 111932

Int64Index([154461, 154462, 154463, 154464, 154465, 154466, 154467, 154468,
            154469, 194242, 194243, 360633, 360634, 360635, 360636, 360637,
            360638, 360639, 360640, 360641, 360642, 360643, 360644, 360645,
            360646, 360647, 360648, 360649, 475565, 475566, 475567, 475568,
            475569, 475570, 475571, 475572, 475573, 524458, 524459, 524460,
            524461, 524462],
           dtype='int64')
17949.0
Int64Index([  6607,  45492,  99110,  99111, 106784, 117005, 169823, 180752,
            180754, 180755, 180756, 192704, 200351, 200369, 208190, 212528,
            212529, 244402, 244403, 244404, 244413, 244414, 244415, 244416,
            244417, 244418, 244419, 244420, 244421, 244423, 244424, 244799,
            244800, 245136, 245137, 259970, 274449, 282260, 282261, 282262,
            282263, 282264, 282265, 282266, 282267, 282288, 296537, 304342,
            304419, 304420, 304421, 304430, 304431, 304432, 309359, 314225,
            319753, 32106

Int64Index([  4884,   4885,   6364,   6365,   6366,  58092,  58093,  58094,
             58095,  58096,
            ...
            513438, 513439, 513440, 513441, 513442, 513443, 513444, 513445,
            513446, 513447],
           dtype='int64', length=1420)
17531.0
Int64Index([212004, 212005, 212006, 212007, 212008, 212009, 212010, 212011,
            212012, 212013, 212014, 212015, 212016, 212017, 297908],
           dtype='int64')
14534.0
Int64Index([ 70398,  70576,  70577,  70578,  70579,  70580,  70581,  70582,
             70583,  70584,
            ...
            533771, 533772, 533773, 533774, 533775, 533776, 533777, 533778,
            533779, 533780],
           dtype='int64', length=558)
17243.0
Int64Index([  7419,   7420,   7421,   7422,   7423,   7424,   7425,   7426,
              7427,   7428,
            ...
            522179, 538100, 538101, 538102, 538103, 538104, 538105, 538106,
            538107, 538108],
           dtype='int64', length=407)
17865.0
Int64In

Int64Index([ 39003,  39004,  39005,  39006,  39007,  39008,  39009,  39010,
             39011,  39012,
            ...
            486187, 486188, 486189, 486190, 486191, 486192, 486193, 486194,
            486195, 486196],
           dtype='int64', length=166)
17078.0
Int64Index([438413, 438414, 438415, 438416, 438417, 438418, 438419, 438420,
            438421, 438422, 438423, 438424, 438425, 438426, 438427, 438428,
            438429, 438430, 438431, 438432, 438433, 438434, 438435, 438436,
            438437, 438438, 438439, 438451],
           dtype='int64')
17979.0
Int64Index([133268, 133269, 133270, 133271, 133272, 133273, 133274, 133275,
            133276, 133277,
            ...
            439451, 439452, 439453, 439454, 439455, 439456, 439457, 439458,
            439459, 439460],
           dtype='int64', length=147)
12577.0
Int64Index([ 32250,  32251,  32252,  32253,  32254,  32255,  32256,  32257,
             32258,  32259,
            ...
            440093, 440094, 440

In [59]:
# record shows these customers are returning customers so the code 'M' has a different meaning
# Let's check the total purchase value, code 'M' represents.


df_value = customer_df
df_value['Price'] = df_value['Quantity']*df_value['UnitPrice']


print('Label M as a percentage: '+str(sum(df_value.loc[df_value['StockCode']=='M']['Price'])
                                      *100/sum(df_value['Price'])))


Label M as a percentage: -0.7045133959654735


### Investigation of Discounts

In [31]:
# let's check the Customer Id 14527.0

customer_df.loc[customer_df['CustomerID']==14527.0].head(50)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
141,C536379,D,Discount,-1,12/1/2010 9:41,27.5,14527.0,United Kingdom
8963,537159,22112,CHOCOLATE HOT WATER BOTTLE,6,12/5/2010 13:17,4.95,14527.0,United Kingdom
8964,537159,22111,SCOTTIE DOG HOT WATER BOTTLE,1,12/5/2010 13:17,4.95,14527.0,United Kingdom
8965,537159,21479,WHITE SKULL HOT WATER BOTTLE,1,12/5/2010 13:17,3.75,14527.0,United Kingdom
8966,537159,22114,HOT WATER BOTTLE TEA AND SYMPATHY,6,12/5/2010 13:17,3.95,14527.0,United Kingdom
8967,537159,35598B,BLACK CHRISTMAS TREE 60CM,1,12/5/2010 13:17,2.95,14527.0,United Kingdom
8968,537159,22866,HAND WARMER SCOTTY DOG DESIGN,3,12/5/2010 13:17,2.1,14527.0,United Kingdom
8969,537159,22865,HAND WARMER OWL DESIGN,7,12/5/2010 13:17,2.1,14527.0,United Kingdom
8970,537159,22632,HAND WARMER RED RETROSPOT,3,12/5/2010 13:17,2.1,14527.0,United Kingdom
8971,537159,22684,FRENCH BLUE METAL DOOR SIGN 9,1,12/5/2010 13:17,1.25,14527.0,United Kingdom


In [24]:
# It seems like the first record of this customer is a discount. This might be a customer returning after a special offer.
# Let's check about other Disconts
# the discount in the record 9038 is issued just after 4 minutes of an order so these discounts must be cashbacks
# but the order on 12/9/2010 12:32 hasn't been rewarded with cashbacks
# And returning of customer ID indicates the outlate trackes customer data.

customer_df.loc[(customer_df['CustomerID']==14527.0) & (customer_df['Description']=='Discount')].head()


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
141,C536379,D,Discount,-1,12/1/2010 9:41,27.5,14527.0,United Kingdom
9038,C537164,D,Discount,-1,12/5/2010 13:21,29.29,14527.0,United Kingdom
31663,C539003,D,Discount,-1,12/15/2010 12:35,26.93,14527.0,United Kingdom
38609,C539589,D,Discount,-1,12/20/2010 13:14,13.88,14527.0,United Kingdom
44405,C540171,D,Discount,-1,1/5/2011 12:26,22.97,14527.0,United Kingdom


In [29]:
# Let's check if other customers are rewarded with discounts.

len(customer_df.loc[customer_df['Description']=='Discount']['CustomerID'].unique())

# So 24 unique customers have been discounted.

24

In [30]:
# Let's check if StockCode D represents similar number

len(customer_df.loc[customer_df['StockCode']=='D']['CustomerID'].unique())


24

In [60]:
# Let's check the total purchase value, code 'M' represents.


print('Label M as a percentage: '+str(sum(df_value.loc[df_value['StockCode']=='D']['Price'])
                                      *100/sum(df_value['Price'])))

Label M as a percentage: -0.05843626690560821


In [93]:
suggestions_cleaning = np.append(suggestions_cleaning,
                                 'Remove Special labels as they represents small values')
suggestions_cleaning

array(['After removing special labels, check negative value representation and sum them accordint to the custoemr',
       'Remove Special labels as they represents small values'],
      dtype='<U105')

### Investigating customerID null values

In [67]:
# First Hypothesis for customerID to be null is that these are local customers

customer_df.loc[customer_df['CustomerID'].isnull()].groupby(by = 'Country').count()

Unnamed: 0_level_0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Value
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bahrain,2,2,2,2,2,2,0,2
EIRE,711,711,711,711,711,711,0,711
France,66,66,66,66,66,66,0,66
Hong Kong,288,288,288,288,288,288,0,288
Israel,47,47,47,47,47,47,0,47
Portugal,39,39,39,39,39,39,0,39
Switzerland,125,125,125,125,125,125,0,125
United Kingdom,133600,133600,132146,133600,133600,133600,0,133600
Unspecified,202,202,202,202,202,202,0,202


In [None]:
# So customer locality is not the cause
# let's observe invoice numbers for these null values.

In [71]:
customer_df.loc[customer_df['CustomerID'].isnull()].groupby(by = 'InvoiceNo').count().describe()

Unnamed: 0,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Value
count,3710.0,3710.0,3710.0,3710.0,3710.0,3710.0,3710.0,3710.0
mean,36.409704,36.01779,36.409704,36.409704,36.409704,0.0,36.409704,36.409704
std,96.098473,96.244052,96.098473,96.098473,96.098473,0.0,96.098473,96.098473
min,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
25%,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
50%,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
75%,9.0,9.0,9.0,9.0,9.0,0.0,9.0,9.0
max,1114.0,1114.0,1114.0,1114.0,1114.0,0.0,1114.0,1114.0


In [86]:
# more than 50 percent of null customers generated single invoice numbers.
# let's check customers generated more tahn a single invoice.

group_df = customer_df.loc[customer_df['CustomerID'].isnull()].groupby(by = 'InvoiceNo').count()

list_multiple = group_df.loc[group_df['Value']>1].index.tolist()

for entry in list_multiple:
    print(len(customer_df.loc[customer_df['InvoiceNo']==entry]['CustomerID'].unique()))

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [87]:
# These invoice numbers cannot be assigned for known customers.

In [94]:
suggestions_cleaning = np.append(suggestions_cleaning,'Remove all null value customers')

suggestions_cleaning

array(['After removing special labels, check negative value representation and sum them accordint to the custoemr',
       'Remove Special labels as they represents small values',
       'Remove all null value customers'], dtype='<U105')

In [97]:
with open('Dataframe cleaning suggestions.txt', 'w') as f:
    for item in suggestions_cleaning:
        f.write("%s\n" % item)