![title](bw.JPG)

# Problem Statement

**A financial institution news agency has collected 3000 news articles that relate to several matters of financial importance. Before analyzing these unlabeled news articles, it is only fair to try to partition them into some sort of logical groupings based on their similarities.**

**Your task is to use an appropriate unsupervised machine learning algorithm to form the news clusters based on their similarity. Prior to clustering, it is recommended to perform basic natural language processing steps such as stemming, tokenization and word vectorization for best results.**

Notes to keep in mind:

There are no duplicate rows in the dataset.

Cluster number should start from 0.

### Data Description
There is only one file news.csv that contains date, headlines and text of the news.

|Column|Description|
|------|------|
|id|The unique id of the news|
|headline|The headline of the news in text|
|text|The body of the news in text|


### Submission
The submission file should be a zip containing a .txt and .csv file. Both should have 3000 rows.

.txt file should contain the matrix / ndarrays you are using to create clusters.
.csv file should contain the cluster assigned to each news id.
Format of the csv file:

|id|cluster|
|------|------|
|uid-1|0|
|uid-2|0|
|uid-3|1|
|uid-4|1|
|uid-5|4|

### Evaluation Metric
The submissions will be evaluated on silhouette score. Please read more about the metrics here.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are present in the competition's input directory.
print(os.listdir("../input/c3cc8568-0-dataset"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the raw train and test CSV files; the untouched frames are kept as
# train1/test1 so later cells can work on copies.
train1, test1 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/c3cc8568-0-dataset/train.csv',
        '../input/c3cc8568-0-dataset/test.csv',
    )
)

In [None]:
# Display the (rows, columns) shape of the raw training frame.
train1.shape


In [None]:
# Per-class weight = 1 - (rows with that status / total rows), so the rarer a
# status is, the closer its weight is to 1.
status_counts = train1['Complaint-Status'].value_counts()
status_share = status_counts / train1.shape[0]
wt = dict(1 - status_share)
wt

In [None]:
# Count missing values per column of the raw training frame.
train1.isnull().sum()

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
# Anything that is not an ASCII letter is treated as a separator.
_NON_LETTERS = re.compile(r"[^a-zA-Z]")

# Lazily filled cache so the NLTK stop-word list and the stemmer are built
# once instead of once per cleaned document.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the cached (stop-word set, Snowball stemmer) pair for English."""
    if not _TEXT_TOOLS:
        stops = frozenset(stopwords.words("english"))
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        # Populate in one step so a failure above never leaves a half-built cache.
        _TEXT_TOOLS.update(stops=stops, stemmer=stemmer)
    return _TEXT_TOOLS["stops"], _TEXT_TOOLS["stemmer"]


def clean_text(raw_text):
    """Normalize a document into a space-joined string of stemmed tokens.

    Steps: strip surrounding whitespace, decode bytes as UTF-8 (dropping a
    BOM and replacing undecodable bytes), replace every non-ASCII-letter
    character with a space, lowercase, drop English stop words, and stem
    the remaining words with the Snowball stemmer.

    Parameters
    ----------
    raw_text : str or bytes
        The text to clean.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces
        (an empty string when nothing survives the filtering).

    Raises
    ------
    AttributeError
        If ``raw_text`` has no ``strip`` method (e.g. a float NaN).
    """
    raw_text = raw_text.strip()
    if isinstance(raw_text, bytes):
        # Only bytes can be decoded on Python 3; malformed sequences become
        # U+FFFD, which the letter filter below turns into a separator.
        raw_text = raw_text.decode("utf-8-sig", errors="replace")

    stops, stemmer = _get_text_tools()
    words = _NON_LETTERS.sub(" ", raw_text).lower().split()
    return " ".join(stemmer.stem(word) for word in words if word not in stops)

In [None]:
# import py-translate
# translator = Translator()
# from nltk.misc import babelfish
# smpl=train1['Consumer-complaint-summary'].sample(1,random_state=1994).values
# print(smpl)

# [w for w in smpl if not w in set(stopwords.words("french")) ]
# babelfish.translate(smpl)
# print(translator.translate(smpl))

In [None]:
def dateSim(val):
    """Return 1 when ``val`` equals zero, otherwise 0."""
    return 1 if val == 0 else 0

# Mean Gregorian year / month lengths in days. These replace division by
# np.timedelta64(1, 'Y') / (1, 'M'), which current numpy/pandas reject because
# those units have no fixed duration; 365.2425 days is the year length numpy
# uses for its 'Y' unit, so the resulting features keep the same scale.
_DAYS_PER_YEAR = 365.2425
_DAYS_PER_MONTH = _DAYS_PER_YEAR / 12

# Work on a copy so the raw frame stays available.
train = train1.copy()

# Gap between the complaint being received and being forwarded to the company.
train['Date-received'] = pd.to_datetime(train['Date-received'])
train['Date-sent-to-company'] = pd.to_datetime(train['Date-sent-to-company'])
train['diff'] = train['Date-sent-to-company'] - train['Date-received']
train['diff_days'] = train['diff'] / np.timedelta64(1, 'D')
train['diff_year'] = train['diff_days'] / _DAYS_PER_YEAR
train['diff_m'] = train['diff_days'] / _DAYS_PER_MONTH
# train['diff_w']=train['diff']/np.timedelta64(1,'W')

# Assign the filled column back instead of fillna(inplace=True) on a column
# selection, which is chained assignment and does not reliably update `train`.
train['Company-response'] = train['Company-response'].fillna('None')
train['Consumer-disputes'] = train['Consumer-disputes'].fillna('Other')

# Normalize the two free-text columns (stop-word removal + stemming).
train['Consumer-complaint-summary'] = train['Consumer-complaint-summary'].apply(clean_text)
train['Complaint-reason'] = train['Complaint-reason'].apply(clean_text)

# 1 when the complaint was forwarded on the day it was received, else 0.
train['isSameDay'] = train['diff_days'].apply(dateSim)

# Character lengths of the cleaned text columns.
train['Complaint-reasonLen'] = train['Complaint-reason'].apply(len)
train['Consumer-complaint-summaryLen'] = train['Consumer-complaint-summary'].apply(len)

# The raw dates and the timedelta column are no longer needed.
train = train.drop(columns=['Date-sent-to-company', 'Date-received', 'diff'])

In [None]:
# Preview the first rows of the engineered training frame.
train.head()

In [None]:
import gc
# Force a garbage-collection pass before the memory-heavy vectorization below.
gc.collect()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# One-hot encode the categorical columns, dropping the first level of each.
train = pd.get_dummies(
    train,
    columns=['Transaction-Type', 'Company-response', 'Consumer-disputes'],
    drop_first=True,
)

# Word-level TF-IDF (uni- and bigrams) of the complaint reason.
vec_cr = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", analyzer='word')
comp_reason = vec_cr.fit_transform(train['Complaint-reason'])

# Character-level TF-IDF (1- to 8-grams) of the complaint reason.
# stop_words is omitted here: scikit-learn only applies it when
# analyzer == 'word', so passing it merely triggered an "unused parameter"
# warning without changing the features.
vec_cr_char = TfidfVectorizer(ngram_range=(1, 8), analyzer='char')
comp_reasonChar = vec_cr_char.fit_transform(train['Complaint-reason'])

# Word-level TF-IDF (uni- to trigrams) of the complaint summary.
vec_cs = TfidfVectorizer(ngram_range=(1, 3), stop_words="english", analyzer='word')
consum_comp_sum = vec_cs.fit_transform(train['Consumer-complaint-summary'])

# Character-level TF-IDF (1- to 9-grams) of the complaint summary.
vec_csChar = TfidfVectorizer(ngram_range=(1, 9), analyzer='char')
consum_comp_sumChar = vec_csChar.fit_transform(train['Consumer-complaint-summary'])

In [None]:
# Names of the non-text columns that are stacked next to the TF-IDF matrices:
# the date-gap features, the cleaned-text lengths, the one-hot indicator
# columns (prefixed with their source column name) and the isSameDay flag.
# The same list is used to select columns from both the train and test frames.
feats=[ 'diff_days', 'diff_year', 'diff_m','Complaint-reasonLen','Consumer-complaint-summaryLen',
       'Transaction-Type_Checking or savings account',
       'Transaction-Type_Consumer Loan', 'Transaction-Type_Credit card',
       'Transaction-Type_Credit card or prepaid card',
       'Transaction-Type_Credit reporting',
       'Transaction-Type_Credit reporting, credit repair services, or other personal consumer reports',
       'Transaction-Type_Debt collection',
       'Transaction-Type_Money transfer, virtual currency, or money service',
       'Transaction-Type_Money transfers', 'Transaction-Type_Mortgage',
       'Transaction-Type_Other financial service',
       'Transaction-Type_Payday loan',
       'Transaction-Type_Payday loan, title loan, or personal loan',
       'Transaction-Type_Prepaid card', 'Transaction-Type_Student loan',
       'Transaction-Type_Vehicle loan or lease',
       'Transaction-Type_Virtual currency',
       'Company-response_Company believes complaint is the result of an isolated error',
       'Company-response_Company believes complaint relates to a discontinued policy or procedure',
       'Company-response_Company believes complaint represents an opportunity for improvement to better serve consumers',
       'Company-response_Company believes it acted appropriately as authorized by contract or law',
       'Company-response_Company believes the complaint is the result of a misunderstanding',
       "Company-response_Company can't verify or dispute the facts in the complaint",
       'Company-response_Company chooses not to provide a public response',
       'Company-response_Company disputes the facts presented in the complaint',
       'Company-response_Company has responded to the consumer and the CFPB and chooses not to provide a public response',
       'Company-response_None', 'Consumer-disputes_Other',
       'Consumer-disputes_Yes','isSameDay']

In [None]:
from scipy.sparse import csr_matrix
from scipy import sparse
# Convert the tabular features to an explicit float matrix before stacking:
# on pandas >= 2.0 get_dummies produces bool columns, and a frame mixing bool
# and float columns converts to an object array that scipy.sparse cannot stack.
dense_train_feats = sparse.csr_matrix(train[feats].to_numpy(dtype=float))

# Column-wise concatenation of the tabular features with the four TF-IDF
# matrices, stored as CSR.
final_features = sparse.hstack(
    (dense_train_feats, comp_reason, consum_comp_sum, comp_reasonChar, consum_comp_sumChar)
).tocsr()

In [None]:
# Display the summary (shape, dtype, stored elements) of the stacked training matrix.
final_features

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score
# Hold out 30% of the training rows for validation (fixed seed for repeatability).
X, y = final_features, train['Complaint-Status']
split_parts = train_test_split(X, y, test_size=0.3, random_state=1994)
X_train, X_val, y_train, y_val = split_parts

In [None]:
# wt={'Closed with explanation': 0.9,
#  'Closed with non-monetary relief': 0.5,
#  'Closed with monetary relief': 0.8,
#  'Closed': 0.8,
#  'Untimely response': 0.8}

In [None]:
# import xgboost as xgb
# clf = xgb.XGBClassifier(
# #                 max_depth = 5,
#                 n_estimators=1000,
# #                 learning_rate=0.1, 
# #                 nthread=4,
# #                 subsample=1.0,
# #                 colsample_bytree=0.5,
# #                 min_child_weight = 3,
# #                 scale_pos_weight = ratio,
# #                 reg_alpha=0.03,
#                 seed=1994,verbose_eval=100)
                
# clf.fit(X_train, y_train, early_stopping_rounds=50, eval_metric="mlogloss",
#         eval_set=[(X_train, y_train), (X_val, y_val)])
        
# p=clf.predict(X_val, ntree_limit=clf.best_iteration)
# print(f1_score(y_val,p,average='weighted'))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# lr=LogisticRegression(verbose=10,class_weight='balanced',C=5,random_state=1994,n_jobs=-1,intercept_scaling=2)
# lr.fit(X_train,y_train)
# lrpred=lr.predict(X_val)
# print(f1_score(y_val,lrpred,average='weighted'))

In [None]:
# xgb=XGBClassifier()
# xgb.fit(X_train.tocsc(),y_train)
# cbpred=xgb.predict(X_val.to_csc())
# print(f1_score(y_val,cbpred,average='weighted'))

# from sklearn.neural_network import MLPClassifier
# clf = MLPClassifier(verbose=10)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_val)
# print(f1_score(y_val,y_pred,average='weighted'))

In [None]:
# Mean Gregorian year / month lengths in days. These replace division by
# np.timedelta64(1, 'Y') / (1, 'M'), which current numpy/pandas reject because
# those units have no fixed duration; 365.2425 days is the year length numpy
# uses for its 'Y' unit, so the resulting features keep the same scale.
_DAYS_PER_YEAR = 365.2425
_DAYS_PER_MONTH = _DAYS_PER_YEAR / 12

# Work on a copy so the raw frame stays available.
test = test1.copy()

# Gap between the complaint being received and being forwarded to the company.
test['Date-received'] = pd.to_datetime(test['Date-received'])
test['Date-sent-to-company'] = pd.to_datetime(test['Date-sent-to-company'])
test['diff'] = test['Date-sent-to-company'] - test['Date-received']
test['diff_days'] = test['diff'] / np.timedelta64(1, 'D')
test['diff_year'] = test['diff_days'] / _DAYS_PER_YEAR
test['diff_m'] = test['diff_days'] / _DAYS_PER_MONTH
# Gap in weeks; kept for parity with the original cell.
test['diff_w'] = test['diff_days'] / 7

# Assign the filled column back instead of fillna(inplace=True) on a column
# selection, which is chained assignment and does not reliably update `test`.
test['Company-response'] = test['Company-response'].fillna('None')
test['Consumer-disputes'] = test['Consumer-disputes'].fillna('Other')

# Normalize the two free-text columns (stop-word removal + stemming).
test['Consumer-complaint-summary'] = test['Consumer-complaint-summary'].apply(clean_text)
test['Complaint-reason'] = test['Complaint-reason'].apply(clean_text)

# 1 when the complaint was forwarded on the day it was received, else 0.
test['isSameDay'] = test['diff_days'].apply(dateSim)

# Character lengths of the cleaned text columns.
test['Complaint-reasonLen'] = test['Complaint-reason'].apply(len)
test['Consumer-complaint-summaryLen'] = test['Consumer-complaint-summary'].apply(len)

# The raw dates and the timedelta column are no longer needed.
test = test.drop(columns=['Date-sent-to-company', 'Date-received', 'diff'])
test.head()

In [None]:
# One-hot encode the test categoricals WITHOUT drop_first: the level that
# drop_first removes depends on which categories happen to occur in this
# frame, so encoding test independently could drop a column that the
# train-derived `feats` list expects. Extra columns are harmless because only
# the `feats` columns are selected later.
_dummy_source_cols = ['Transaction-Type', 'Company-response', 'Consumer-disputes']
test = pd.get_dummies(test, columns=_dummy_source_cols)

# A category that never occurs in the test set produces no indicator column;
# add it as all zeros so selecting `feats` does not raise a KeyError.
_dummy_prefixes = tuple(_src + '_' for _src in _dummy_source_cols)
for _col in feats:
    if _col.startswith(_dummy_prefixes) and _col not in test.columns:
        test[_col] = 0

# Reuse the vectorizers fitted on the training text (transform only, no refit).
comp_reason_test = vec_cr.transform(test['Complaint-reason'])
consum_comp_sum_test = vec_cs.transform(test['Consumer-complaint-summary'])

comp_reason_testchar = vec_cr_char.transform(test['Complaint-reason'])
consum_comp_sum_testchar = vec_csChar.transform(test['Consumer-complaint-summary'])

In [None]:
# Convert the tabular features to an explicit float matrix before stacking:
# on pandas >= 2.0 get_dummies produces bool columns, and a frame mixing bool
# and float columns converts to an object array that scipy.sparse cannot stack.
dense_test_feats = sparse.csr_matrix(test[feats].to_numpy(dtype=float))

# Same column layout as the training matrix: tabular features followed by the
# word- and character-level TF-IDF blocks.
final_features_test = sparse.hstack(
    (dense_test_feats, comp_reason_test, consum_comp_sum_test, comp_reason_testchar, consum_comp_sum_testchar)
).tocsr()
final_features_test

In [None]:
# Train the final class-balanced logistic regression on every training row,
# then predict a complaint status for each test row.
lr_params = dict(verbose=1, class_weight='balanced', C=5, random_state=1996, n_jobs=-1)
train_labels = train['Complaint-Status'].values
lr = LogisticRegression(**lr_params)
lr.fit(final_features, train_labels)
lrpred = lr.predict(final_features_test)

In [None]:
# preds=[]
# from sklearn.model_selection import StratifiedKFold
# kf = StratifiedKFold(n_splits=3,random_state=1994,shuffle=True)
# for train_index,test_index in kf.split(X,y):
# #     print('\n{} of kfold {}'.format(i,kf.n_splits))
#     Xtrain,Xtest = X[train_index],X[test_index]
#     ytrain,ytest = y[train_index],y[test_index]
# #     print(Xtrain.shape,Xtest.shape)
# #     print(ytrain.shape,ytest.shape)
#     lr=LogisticRegression(verbose=1,class_weight='balanced',C=5,random_state=1994,n_jobs=-1)
#     lr.fit(Xtrain,ytrain)
#     lrpred=lr.predict(final_features_test)
#     preds.append(lrpred)

In [None]:
# for i in range(len(preds)):
#     s=pd.DataFrame({'Complaint-ID':test['Complaint-ID'],'Complaint-Status':preds[i]})
#     s.to_csv('lrsKfolds'+str(i)+'.csv',index=False)

In [None]:
# Submission frame: the test ids alongside their predicted status.
s = test[['Complaint-ID']].assign(**{'Complaint-Status': lrpred})
s.head()

In [None]:
# Write the submission to the working directory without the index column.
s.to_csv('lrs9.csv',index=False)

In [None]:
# s['Complaint-Status']=mbpred
# s.to_csv('mbs1.csv',index=False)