In [None]:

import pickle
import pickleshare
import pandas as pd
import numpy as np
from unittest.mock import inplace
from unittest.mock import patch
class PreProcess(object):
    """Turn the raw enrollment / date / log CSV files into intermediate CSVs.

    ``data_trans`` runs every step in order with hard-coded relative paths;
    the other methods take explicit source and target paths.  ``course_map``
    and ``username_map`` look values up in ``self.course_dict`` and
    ``self.username_dict``, which ``data_trans`` fills in before using them.
    """

    def gen_courseid_dict(self,source_path):
        """Return a dict mapping each distinct course id to an integer code.

        Only the first CSV column is read, and it must be named ``course_id``.
        Codes are assigned in order of first appearance, starting at 0.
        """
        # source_path = 'original_data/train/date.csv'
        df = pd.read_csv(source_path,usecols=[0])
        distinct_courses = pd.factorize(df.course_id)[1]
        course_dict = {course: code for code, course in enumerate(distinct_courses)}
        print ("course_dict done...")
        return course_dict

    def gen_username_dict(self,source_path_train,source_path_test):
        """Return a dict mapping every username (train and test) to an integer code.

        Train usernames are numbered first, in order of first appearance;
        usernames that only occur in the test file continue the numbering.
        Only the second CSV column is read, and it must be named ``username``.
        """
        # source_path_train = 'original_data/train/enrollment_train.csv'
        # source_path_test = 'original_data/test/enrollment_test.csv'
        train = pd.read_csv(source_path_train,usecols=[1])
        username_dict = {
            name: code
            for code, name in enumerate(pd.factorize(train.username)[1])
        }

        test = pd.read_csv(source_path_test,usecols=[1])
        for name in pd.factorize(test.username)[1]:
            # Dict membership keeps the lookup constant-time per name.
            if name not in username_dict:
                username_dict[name] = len(username_dict)

        print ("username_dict done...")
        return username_dict

    def course_map(self,x):
        """Return the integer code of course id ``x`` (KeyError if unknown)."""
        return self.course_dict[x]

    def username_map(self,x):
        """Return the integer code of username ``x`` (KeyError if unknown)."""
        return self.username_dict[x]

    def time_split(self,x):
        """Return the first ten characters of ``x``."""
        return x[:10]

    def enrollment_map(self,source_path_train,source_path_test,target_path_train,target_path_test):
        """Re-write both enrollment files with usernames and courses integer-coded.

        The first three columns of each source file are read; columns 1 and 2
        are converted through ``username_map`` and ``course_map``.
        """
        print ("read enrollment_train.csv")
        # source_path_train = 'original_data/train/enrollment_train.csv'
        # source_path_test = 'original_data/test/enrollment_test.csv'
        #
        # target_path_train = "preprocess_data/enrollment_train#.csv"
        # target_path_test = "preprocess_data/enrollment_test#.csv"
        converters = {1:self.username_map,2:self.course_map}
        for source, target in ((source_path_train, target_path_train),
                               (source_path_test, target_path_test)):
            frame = pd.read_csv(source,usecols=[0,1,2],converters=converters)
            frame.to_csv(target,index=False)

    def date_map(self,source_path,target_path):
        """Integer-code the course column and add ``day_nums`` = ``to`` - ``from`` in days."""
        # source_path = 'original_data/train/date.csv'
        # target_path = "preprocess_data/date#.csv"
        print ("read date.csv")
        df1 = pd.read_csv(source_path,converters={0:self.course_map})
        span = pd.to_datetime(df1["to"]) - pd.to_datetime(df1["from"])
        df1["day_nums"] = span.dt.days
        df1.to_csv(target_path,index=False)

    def log_clean(self,source_path,target_path):
        """Keep columns 0, 1 and 3 of the log and split ``time`` into date and time.

        ``date`` receives the first ten characters of the original ``time``
        value and ``time`` keeps everything from character 11 onwards.
        """
        # source_path = 'original_data/train/log_train.csv'
        # target_path = "preprocess_data/log_train#.csv"
        print ("read log_train.csv ")
        df1 = pd.read_csv(source_path,usecols=[0,1,3])
        stamp = df1["time"]
        df1["date"] = stamp.str[:10]
        df1["time"] = stamp.str[11:]
        df1.to_csv(target_path,index=False)

    def course_enrollment(self,source_path_train,source_path_test,source_path_date,target_path_train,target_path_test):
        """Left-join the course date table onto both enrollment files by ``course_id``."""
        # source_path_train = 'preprocess_data/enrollment_train#.csv'
        # source_path_test = 'preprocess_data/enrollment_test#.csv'
        # source_path_date = 'preprocess_data/date#.csv'
        # target_path_train = "preprocess_data/course_enrollment_train#.csv"
        # target_path_test = "preprocess_data/course_enrollment_test#.csv"
        print("course_enrollment....")
        dates = pd.read_csv(source_path_date)
        for source, target in ((source_path_train, target_path_train),
                               (source_path_test, target_path_test)):
            enrollments = pd.read_csv(source)
            merged = pd.merge(enrollments,dates,how="left",on="course_id")
            merged.to_csv(target,index=False)

    def log_interval(self,source_path_log_train,source_path_enrol_train,target_path):
        """Add ``interval``: 1-based day offset of each log row from the course start.

        Columns 0 and 3 of the enrollment file are read; they are used as
        ``enrollment_id`` and ``from``.  ``from`` is dropped again before saving.
        """
        # source_path_log_train = 'preprocess_data/log_train#.csv'
        # source_path_enrol_train = 'preprocess_data/course_enrollment_train#.csv'
        # target_path = "preprocess_data/log_train_final#.csv"
        print("log_interval....")
        logs = pd.read_csv(source_path_log_train)
        course_start = pd.read_csv(source_path_enrol_train,usecols=[0,3])
        merged = pd.merge(logs,course_start,how="left",on="enrollment_id")
        elapsed = pd.to_datetime(merged["date"]) - pd.to_datetime(merged["from"])
        # +1 so that a log entry on the start date gets interval 1.
        merged["interval"] = elapsed.dt.days + 1
        merged.drop(["from"],axis=1,inplace=True)
        merged.to_csv(target_path,index=False)

    def enrollment_dropout(self,source_path_enrol_train,source_path_truth,target_path):
        """Attach per-user course counts and the dropout label to each enrollment.

        Output columns added to the enrollment file: ``course_num`` (number
        of enrollments of the user), ``nondropout_num`` (number of those
        labelled 0) and ``dropout`` (the label of the row itself).  The truth
        file is read without a header as ``enrollment_id, dropout``.
        """
        # source_path_enrol_train = 'preprocess_data/enrollment_train#.csv'
        # source_path_truth = 'original_data/train/truth_train.csv'
        # target_path = "preprocess_data/enrollment_dropout#.csv"
        print("merge_enrollment")
        enrollments = pd.read_csv(source_path_enrol_train)
        truth = pd.read_csv(source_path_truth,names=['enrollment_id','dropout'])
        labelled = pd.merge(enrollments,truth,how="left",on="enrollment_id")

        course_counts = labelled.groupby("username").course_id.count().to_frame()
        course_counts.rename(columns={'course_id':'course_num'}, inplace = True)

        label_counts = (
            labelled.groupby(["username","dropout"]).course_id.count().unstack().fillna(0)
        )
        # Guarantee both label columns exist: without this a data set in which
        # one label never occurs either fails on the drop below or produces
        # no nondropout_num column at all.
        label_counts = label_counts.reindex(columns=[0, 1], fill_value=0)
        label_counts.rename(columns={0:'nondropout_num', 1:'dropout_num'}, inplace = True)
        label_counts.drop(["dropout_num"],axis=1,inplace=True)

        result = pd.merge(enrollments,course_counts,how="left",left_on="username",right_index=True)
        result = pd.merge(result,label_counts,how="left",left_on="username",right_index=True)
        result = pd.merge(result,truth,how="left",on="enrollment_id")
        result.to_csv(target_path,index=False)

    def data_trans(self):
        """Run the whole preprocessing pipeline on the hard-coded project paths.

        The ``preprocess_data`` directory is written to but not created here.
        """
        source_path = 'original_data/train/date.csv'
        self.course_dict = self.gen_courseid_dict(source_path)

        source_path_train = 'original_data/train/enrollment_train.csv'
        source_path_test = 'original_data/test/enrollment_test.csv'
        self.username_dict = self.gen_username_dict(source_path_train,source_path_test)

        target_path_train="preprocess_data/enrollment_train#.csv"
        target_path_test="preprocess_data/enrollment_test#.csv"
        self.enrollment_map(source_path_train,source_path_test,target_path_train,target_path_test)

        source_path='original_data/train/date.csv'
        target_path="preprocess_data/date#.csv"
        self.date_map(source_path,target_path)

        source_path='original_data/train/log_train.csv'
        target_path="preprocess_data/log_train#.csv"
        self.log_clean(source_path,target_path)

        source_path_train='preprocess_data/enrollment_train#.csv'
        source_path_test='preprocess_data/enrollment_test#.csv'
        source_path_date='preprocess_data/date#.csv'
        target_path_train="preprocess_data/course_enrollment_train#.csv"
        target_path_test="preprocess_data/course_enrollment_test#.csv"
        self.course_enrollment(source_path_train,source_path_test,source_path_date,target_path_train,target_path_test)

        source_path_log_train='preprocess_data/log_train#.csv'
        source_path_enrol_train='preprocess_data/course_enrollment_train#.csv'
        target_path="preprocess_data/log_train_final#.csv"
        self.log_interval(source_path_log_train,source_path_enrol_train,target_path)

        source_path_enrol_train='preprocess_data/enrollment_train#.csv'
        source_path_truth='original_data/train/truth_train.csv'
        target_path="preprocess_data/enrollment_dropout#.csv"
        self.enrollment_dropout(source_path_enrol_train,source_path_truth,target_path)


In [None]:

import pandas as pd
import datetime
from functools import reduce
import codecs
import csv
from decimal import *
import numpy as np

class FeatureEngineering(object):
    """Build the per-enrollment feature table from the preprocessed CSV files.

    ``ext_feature`` runs every step in order with hard-coded relative paths;
    the other methods take explicit source and target paths.
    """

    # Event types that count towards ``valid_opnum``.
    _VALID_EVENTS = ("problem", "video", "wiki", "discussion")
    # Days covered by the per-day feature columns (intervals 1..30).
    _DAYS = range(1, 31)

    def nondrop_precent(self,source_path,target_path):
        """Add ``nondrop_precent`` = ``nondropout_num`` / ``course_num`` to the table.

        NOTE(review): ``nondropout_num`` is derived from the dropout labels,
        so this feature carries label information - confirm this is intended.
        """
        # source_path = 'preprocess_data/enrollment_dropout#.csv'
        # target_path = "feature/enrollment_nondrop_precent#.csv"
        print("nondrop_precent...")
        df1 = pd.read_csv(source_path)
        df1["nondrop_precent"]=df1["nondropout_num"]/df1["course_num"]
        df1.to_csv(target_path,index=False)

    def add(self,x,y):
        """Return ``x + y``."""
        return x+y

    def op_character(self,source_path,target_path):
        """Summarise the log per (enrollment_id, interval).

        Each output row holds:
          * ``last_minutes`` - minutes between the first and last listed
            ``time`` value of the group, plus 1 (rows are taken in file
            order, not sorted by time);
          * ``valid_opnum`` - number of rows whose event is one of
            problem / video / wiki / discussion;
          * ``all_opnum`` - number of rows in the group.
        """
        # source_path = 'preprocess_data/log_train_final#.csv'
        # target_path = "feature/log_feature#.csv"
        print("op_character...")
        logs = pd.read_csv(source_path)
        columns = ["enrollment_id","interval","last_minutes","valid_opnum","all_opnum"]

        rows = []
        for enrollment_id, enrol_logs in logs.groupby("enrollment_id"):
            for interval, day_logs in enrol_logs.groupby('interval'):
                times = day_logs["time"].tolist()
                first = datetime.datetime.strptime(times[0],'%H:%M:%S')
                last = datetime.datetime.strptime(times[-1],'%H:%M:%S')
                per_type = [
                    len(day_logs[day_logs["event"] == name])
                    for name in self._VALID_EVENTS
                ]
                rows.append((
                    enrollment_id,
                    interval,
                    (last - first).seconds/60+1,
                    reduce(self.add,per_type),
                    len(day_logs),
                ))

        df2 = pd.DataFrame(rows, columns=columns)
        df2.to_csv(target_path,index=False)

    @staticmethod
    def _summary(values, padded=None):
        """Return [min, max, sum, int(mean), std rounded to 2 places].

        min / max / sum are taken over ``values``; mean and std over
        ``padded`` when it is given, otherwise over ``values`` as well.
        """
        spread = np.array(values if padded is None else padded)
        return [
            min(values),
            max(values),
            np.array(values).sum(),
            int(spread.mean()),
            Decimal(spread.std()).quantize(Decimal('0.00')),
        ]

    def op_of_day(self,source_path,target_path):
        """Pivot the per-interval features into one 111-column row per enrollment.

        Columns: ``enrollment_id``; ``all_opnum_1..30``; ``valid_opnum_1..30``;
        ``last_minutes_1..30`` (0 for days without activity); then
        min / max / sum / mean / std of ``all_opnum`` for days 1-10 (``pre``),
        11-20 (``mid``), 21-30 (``last``) and for the whole enrollment
        (``thirty_day``).  The ``thirty_day`` min / max / sum use only the
        recorded intervals, while its mean / std are taken after padding the
        recorded values with zeros up to 30 entries.
        """
        # source_path = "feature/log_feature#.csv"
        # target_path = "feature/log_feature_final.csv"
        print("op_of_day...")
        framedata1 = pd.read_csv(source_path, encoding='utf-8')

        header = ["enrollment_id"]
        for prefix in ("all_opnum_", "valid_opnum_", "last_minutes_"):
            header.extend(prefix + str(day) for day in self._DAYS)
        for period in ("pre","mid","last","thirty_day"):
            for stat in ("min","max","sum","mean","std"):
                header.append(period + "_" + stat)

        # The with-block closes (and flushes) the output before returning, so
        # the file is complete when the next pipeline step reads it.
        with codecs.open(target_path,'w+','utf-8') as log_final_file:
            writer = csv.writer(log_final_file,delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            writer.writerow(header)

            for enrollment_id,group in framedata1.groupby('enrollment_id'):
                all_num_list = group.all_opnum.tolist()
                # Look values up by interval instead of walking the lists
                # with a running counter: an interval outside 1..30 would
                # otherwise shift every later day onto the wrong values.
                by_interval = dict(zip(
                    group.interval.tolist(),
                    zip(all_num_list,
                        group.valid_opnum.tolist(),
                        group.last_minutes.tolist()),
                ))
                idle = (0, 0, 0)
                per_day = [by_interval.get(day, idle) for day in self._DAYS]
                all_by_day = [entry[0] for entry in per_day]
                valid_by_day = [entry[1] for entry in per_day]
                minutes_by_day = [entry[2] for entry in per_day]

                row = [enrollment_id] + all_by_day + valid_by_day + minutes_by_day
                row += self._summary(all_by_day[:10])
                row += self._summary(all_by_day[10:20])
                row += self._summary(all_by_day[20:30])
                # A negative repeat count yields an empty list, so more than
                # 30 recorded intervals simply means no padding.
                padded = all_num_list + [0] * (30 - len(all_num_list))
                row += self._summary(all_num_list, padded)
                writer.writerow(row)

    def feature_all(self,source_path1,source_path2,target_path):
        """Left-join the enrollment table onto the log features by ``enrollment_id``."""
        # source_path1 = 'feature/log_feature_final.csv'
        # source_path2 = 'feature/enrollment_nondrop_precent#.csv'
        # target_path = "feature/final_feature_all.csv"
        print("feature_all...")
        log_features = pd.read_csv(source_path1)
        enrollment_features = pd.read_csv(source_path2)
        merged = pd.merge(log_features,enrollment_features,on="enrollment_id",how="left")
        merged.to_csv(target_path,index=False)

    def ext_feature(self):
        """Run every feature-extraction step on the hard-coded project paths.

        The ``feature`` directory is written to but not created here.
        """
        source_path='preprocess_data/enrollment_dropout#.csv'
        target_path="feature/enrollment_nondrop_precent#.csv"
        self.nondrop_precent(source_path,target_path)

        source_path='preprocess_data/log_train_final#.csv'
        target_path="feature/log_feature#.csv"
        self.op_character(source_path,target_path)

        source_path="feature/log_feature#.csv"
        target_path="feature/log_feature_final.csv"
        self.op_of_day(source_path,target_path)

        source_path1='feature/log_feature_final.csv'
        source_path2='feature/enrollment_nondrop_precent#.csv'
        target_path="feature/final_feature_all.csv"
        self.feature_all(source_path1,source_path2,target_path)


In [None]:

from __future__ import unicode_literals
import codecs
import numpy as np
import pylab as pl
from itertools import *
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
##from sklearn import cross_validation
from sklearn.model_selection import cross_val_score

import pandas as pd
from sklearn.preprocessing import scale
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier 
from sklearn.ensemble import RandomForestClassifier


class DropoutPredict(object):
    """Cross-validate several classifiers on the final feature table.

    Every model method runs 10-fold cross-validation scored by accuracy,
    prints the mean score and saves a per-fold accuracy plot under
    ``picture/``.

    NOTE: all plots are drawn on pylab's current figure and nothing here
    clears it, so each saved image also contains the curves of the models
    evaluated before it in the same process.
    """

    def loadData(self,filename):
        """Read the feature CSV and return ``(x, y)``.

        ``x`` is the standardised feature matrix without the
        ``enrollment_id``, ``course_id`` and ``dropout`` columns; ``y`` is the
        flattened ``dropout`` column.
        """
        print("loadData...")
        table = pd.read_csv(filename)
        # DataFrame.drop returns a new frame; the result has to be kept or
        # the identifier columns stay in the feature matrix.
        table = table.drop(["enrollment_id","course_id"],axis=1)
        features = table.drop("dropout",axis=1)
        x = scale(features.values)  # zero mean, unit variance per column
        y = np.ravel(table["dropout"])
        return x,y

    @staticmethod
    def _fold_numbers(scores):
        """Return ``[1, 2, ..., len(scores)]``, the x-axis for a fold plot."""
        return list(range(1, len(scores) + 1))

    def _cross_validate(self, clf, x_train, y_train):
        """Return the per-fold accuracy of ``clf`` under 10-fold cross-validation."""
        return cross_val_score(clf, x_train, y_train, cv=10, scoring="accuracy")

    def _plot_scores(self, scores, label, picture_path):
        """Plot per-fold accuracy on the current figure and save it to ``picture_path``."""
        pl.ylabel(u'Accuracy')
        pl.xlabel(u'times')
        # The x-axis is derived from the scores so both always have equal length.
        pl.plot(self._fold_numbers(scores), scores, label=label)
        pl.legend()
        pl.savefig(picture_path)

    def logistic_regression(self,x_train,y_train):
        """Evaluate ``LogisticRegression`` and save ``picture/LogReg.png``."""
        print("logistic_regression...")
        scores = self._cross_validate(LogisticRegression(), x_train, y_train)
        self._plot_scores(scores, 'LogReg', "picture/LogReg.png")
        print (np.mean(scores))

    def svm(self,x_train,y_train):
        """Evaluate ``LinearSVC`` and save ``picture/SVM.png``."""
        print("svm...")
        # Inside the method body ``svm`` resolves to the imported sklearn
        # module, not to this method.
        scores = self._cross_validate(svm.LinearSVC(random_state=2016), x_train, y_train)
        print ('The accuracy of linearSVM:')
        print (np.mean(scores))
        self._plot_scores(scores, 'SVM', "picture/SVM.png")

    def naive_bayes(self,x_train,y_train):
        """Evaluate ``GaussianNB`` and save ``picture/NB.png``."""
        print("naive_bayes...")
        scores = self._cross_validate(GaussianNB(), x_train, y_train)
        print ("The accuracy of Naive Bayes:")
        print (np.mean(scores))
        self._plot_scores(scores, 'NB', "picture/NB.png")

    def decision_tree(self,x_train,y_train):
        """Evaluate ``DecisionTreeClassifier`` and save ``picture/DT.png``."""
        print("decision_tree...")
        scores = self._cross_validate(tree.DecisionTreeClassifier(), x_train, y_train)
        print ('The accuracy of DT:')
        print (np.mean(scores))
        self._plot_scores(scores, 'DT', "picture/DT.png")

    def gradient_boosting(self,x_train,y_train):
        """Evaluate ``GradientBoostingClassifier`` and save ``picture/GBDT.png``."""
        print("gradient_boosting...")
        scores = self._cross_validate(GradientBoostingClassifier(), x_train, y_train)
        print ('The accuracy of GradientBoosting:')
        print (np.mean(scores))
        self._plot_scores(scores, 'GBDT', "picture/GBDT.png")

    def mlp(self,x_train,y_train):
        """Evaluate a one-hidden-layer ``MLPClassifier`` and save ``picture/MLP.png``."""
        print("mlp...")
        clf = MLPClassifier(hidden_layer_sizes=(1000,),
                            activation='logistic', solver='sgd',
                            learning_rate_init = 0.001, max_iter=100000)
        scores = self._cross_validate(clf, x_train, y_train)
        print ('The accuracy of MLP:')
        print (np.mean(scores))
        self._plot_scores(scores, 'MLP', "picture/MLP.png")

    def random_forest(self,x_train,y_train):
        """Evaluate a 100-tree ``RandomForestClassifier`` and save ``picture/RandomForest.png``."""
        print("random_forest...")
        clf = RandomForestClassifier(n_estimators=100)
        scores = self._cross_validate(clf, x_train, y_train)
        print ('The accuracy of RandomForest:')
        print (np.mean(scores))
        # A fixed 30-point x-axis did not match the 10 fold scores and made
        # the plot call fail; the helper sizes the axis from the scores.
        self._plot_scores(scores, 'RandForest', "picture/RandomForest.png")

    def drop_predict(self):
        """Load ``feature/final_feature_all.csv`` and evaluate every model in turn."""
        filename = 'feature/final_feature_all.csv'
        x_train,y_train = self.loadData(filename)

        self.logistic_regression(x_train,y_train)
        self.svm(x_train,y_train)
        self.naive_bayes(x_train,y_train)
        self.decision_tree(x_train,y_train)
        self.gradient_boosting(x_train,y_train)
        self.mlp(x_train,y_train)
        self.random_forest(x_train,y_train)

In [None]:
from data_trans import PreProcess

from ext_feature import FeatureEngineering
from dropout_predict import DropoutPredict
from sklearn.model_selection import cross_validate



if __name__ == '__main__':
    # Program entry point: the full pipeline is data preprocessing, feature
    # extraction and dropout classification.  When the feature files already
    # exist, the first two stages can stay disabled and only the
    # classification is run - which is the current state of this script.
    #
    # Disabled stages:
    #   preprocess = PreProcess()
    #   preprocess.data_trans()
    #
    #   feature_engineering = FeatureEngineering()
    #   feature_engineering.ext_feature()

    # Classification
    predictor = DropoutPredict()
    predictor.drop_predict()
    print("...done...")