In [95]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np




def read_csv(file_path):
    df = pd.read_csv(file_path)
    df = df[['Incident Date' , 'Short Description', 'Incident Type', 'Root Cause', 'Design Related Potential']]
    train_data, train_label = [], []
    test_data, test_label = [], []
    unseen_data, unseen_label = [], []
    date_data = []
    
    for index, row in df.iterrows():
        date = (str(row['Incident Date']))
        string = (str(row['Short Description']) + " " + 
                  str(row['Incident Type']) + " " + 
                  str(row['Root Cause']))
        label = str(row['Design Related Potential'])
        date_data.append(date)
        
        if label == 'y' or label == 'n':
            train_data.append(string)
            train_label.append(label)
           
            

        unseen_data.append(string)
        unseen_label.append(label)
        
    print(date_data)
    
    # split train + test data
    TRAIN_TEST = 60
    test_data = train_data[TRAIN_TEST+1:]
    test_label = train_label[TRAIN_TEST+1:]
    train_data = train_data[:TRAIN_TEST]
    train_label= train_label[:TRAIN_TEST]

    return train_data, train_label, test_data, test_label, unseen_data, unseen_label, date_data

if __name__ == "__main__":
    path = "incidents.csv"
    train_data, train_label, test_data, test_label, unseen_data, unseen_label, date_data = read_csv(path)
    
    text_clf = Pipeline([('vect', CountVectorizer(stop_words = 'english')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])
    #NB MODEL
    text_clf = text_clf.fit(train_data, train_label)
    #twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
    #predicted = text_clf.predict(test_data)
    #print(np.mean(predicted == test_label))
    
    #GRID SEARCH
    parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
                   'tfidf__use_idf': (True, False),
                   'clf__alpha': (1e-2, 1e-3),
    }
    gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
    gs_clf = gs_clf.fit(train_data, train_label)

    predicted = gs_clf.predict(test_data)
    print(np.mean(predicted == test_label))
    #gs_clf.best_score_
    #gs_clf.best_params_
    
    excel = [None]*(len(train_data)+len(train_label))
    excel[::2] = train_data
    excel[1::2] = train_label
    excel_col = ["desc","design"]
    # Model validation
    
    for i in range(len(test_data)):
        #print(test_data[i], '|', predicted[i], '|', test_label[i])
        excel.append(test_data[i])
        excel.append(test_label[i])   

    # Testing against unseen 
    unseen_predicted = text_clf.predict(unseen_data)
    print(np.mean(predicted == test_label))
    no, yes = 0, 0
    for i in range(len(unseen_data)):
        #print(unseen_data[i], '|', unseen_predicted[i])
        if unseen_predicted[i] == 'y':
            yes += 1
        else:
            no += 1
        excel.append(unseen_data[i])
        excel.append(unseen_predicted[i])
    print("Yes:", yes, "No:" , no)

    '''
    excel_col = ["desc","design"]
    num = np.array(excel)
    reshaped = num.reshape(int(len(excel)/2),2)
    df1 = pd.DataFrame(reshaped, columns=excel_col)
    print(df)
    '''



['1/01/2001', '21/06/2012', '21/07/2012', '15/11/2012', '4/12/2012', '6/12/2012', '2/01/2013', '4/01/2013', '4/01/2013', '10/01/2013', '28/01/2013', '1/02/2013', '6/02/2013', '7/02/2013', '11/02/2013', '12/02/2013', '14/02/2013', '15/02/2013', '19/02/2013', '21/02/2013', '25/02/2013', '27/02/2013', '28/02/2013', '1/03/2013', '4/03/2013', '11/03/2013', '12/03/2013', '13/03/2013', '20/03/2013', '21/03/2013', '22/03/2013', '26/03/2013', '27/03/2013', '27/03/2013', '27/03/2013', '10/04/2013', '13/04/2013', '14/04/2013', '15/04/2013', '15/04/2013', '18/04/2013', '24/04/2013', '30/04/2013', '1/05/2013', '1/05/2013', '1/05/2013', '8/05/2013', '9/05/2013', '14/05/2013', '17/05/2013', '21/05/2013', '22/05/2013', '22/05/2013', '22/05/2013', '23/05/2013', '27/05/2013', '27/05/2013', '30/05/2013', '3/06/2013', '8/06/2013', '11/06/2013', '20/06/2013', '20/06/2013', '20/06/2013', '24/06/2013', '25/06/2013', '26/06/2013', '27/06/2013', '28/06/2013', '3/07/2013', '4/07/2013', '9/07/2013', '10/07/2013'



0.9285714285714286
0.9285714285714286
Yes: 108 No: 458


ValueError: attempt to assign sequence of size 566 to extended slice of size 653