In [22]:
import os
import pandas as pd
import numpy as np
import calendar

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

import gensim
import pyLDAvis.gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

np.random.seed(400)

# External running pythons
# %run -i python-code/nlp.py

# Folder path
folder = './run_data/'
folder_traing = './training/'

In [13]:
# 
entries = os.listdir(folder_traing)
print("Total files: ", len(entries))

# Print out file names
print([entry for entry in entries[:5]])

Total files:  23318
['501510283588685824.txt', '564880695911124992.txt', '454427476240769025.txt', '489080940820701184.txt', '296124319132708864.txt']


In [14]:
# Read file and push to list
list_text = []
# list_text.append([f.read() with open(file,'r') as f for file in entries])

for entry in entries:
    file = folder_traing + entry
    with open(file, 'r') as f:
        data = f.read()
        print("Read: ", len(data))
        list_text.append(data)

Read:  192
Read:  4443
Read:  7184
Read:  5236
Read:  1679
Read:  515
Read:  1965
Read:  518
Read:  7
Read:  1122
Read:  6587
Read:  9048
Read:  3930
Read:  845
Read:  2208
Read:  473
Read:  3216
Read:  8080
Read:  5967
Read:  1658
Read:  166
Read:  493
Read:  4113
Read:  415
Read:  524
Read:  5650
Read:  6448
Read:  651
Read:  1456
Read:  5160
Read:  3107
Read:  4039
Read:  519
Read:  3075
Read:  2505
Read:  2290
Read:  4699
Read:  108
Read:  3676
Read:  183
Read:  1152
Read:  1339
Read:  707
Read:  3930
Read:  3604
Read:  10641
Read:  2179
Read:  4161
Read:  344
Read:  5732
Read:  4082
Read:  5981
Read:  3160
Read:  1916
Read:  8454
Read:  507
Read:  3469
Read:  3012
Read:  1499
Read:  4967
Read:  212
Read:  377
Read:  2344
Read:  2238
Read:  2351
Read:  190
Read:  3796
Read:  318
Read:  2502
Read:  3241
Read:  1481
Read:  3533
Read:  3591
Read:  2002
Read:  3619
Read:  3146
Read:  12802
Read:  3005
Read:  2533
Read:  4008
Read:  4367
Read:  2375
Read:  2191
Read:  4468
Read:  2979
R

Read:  3834
Read:  3496
Read:  4475
Read:  1538
Read:  992
Read:  5983
Read:  1038
Read:  3999
Read:  6734
Read:  4901
Read:  732
Read:  3775
Read:  7743
Read:  13509
Read:  155
Read:  330
Read:  2397
Read:  3157
Read:  2092
Read:  3014
Read:  6534
Read:  2880
Read:  3667
Read:  598
Read:  392
Read:  336
Read:  1609
Read:  2504
Read:  5740
Read:  6754
Read:  139
Read:  399
Read:  4483
Read:  5011
Read:  5067
Read:  929
Read:  1046
Read:  3570
Read:  4680
Read:  3300
Read:  334
Read:  344
Read:  3262
Read:  498
Read:  514
Read:  5526
Read:  403
Read:  5505
Read:  4227
Read:  3492
Read:  2251
Read:  1289
Read:  3237
Read:  2818
Read:  5892
Read:  1090
Read:  4821
Read:  3391
Read:  1098
Read:  2746
Read:  516
Read:  1333
Read:  4351
Read:  2080
Read:  2509
Read:  5314
Read:  6291
Read:  6220
Read:  4158
Read:  2666
Read:  1631
Read:  3269
Read:  2574
Read:  3215
Read:  7132
Read:  4639
Read:  3062
Read:  1145
Read:  375
Read:  3379
Read:  3956
Read:  5267
Read:  429
Read:  2484
Read:  53

Read:  6209
Read:  2455
Read:  2270
Read:  6502
Read:  2765
Read:  466
Read:  2546
Read:  5059
Read:  3391
Read:  3978
Read:  990
Read:  4682
Read:  2126
Read:  4469
Read:  1862
Read:  6123
Read:  3151
Read:  3
Read:  5526
Read:  301
Read:  3388
Read:  2435
Read:  568
Read:  1296
Read:  764
Read:  1960
Read:  2736
Read:  2673
Read:  2469
Read:  1724
Read:  3427
Read:  3
Read:  2010
Read:  3527
Read:  4699
Read:  3
Read:  1898
Read:  3961
Read:  406
Read:  6385
Read:  521
Read:  3
Read:  8701
Read:  3636
Read:  3
Read:  2970
Read:  2572
Read:  3590
Read:  133
Read:  3169
Read:  5680
Read:  3458
Read:  6394
Read:  4199
Read:  5347
Read:  77
Read:  3307
Read:  191
Read:  3131
Read:  512
Read:  2375
Read:  7919
Read:  1268
Read:  3658
Read:  7285
Read:  6145
Read:  336
Read:  4076
Read:  1024
Read:  2727
Read:  382
Read:  2448
Read:  3837
Read:  4029
Read:  487
Read:  5466
Read:  3369
Read:  13704
Read:  1984
Read:  3566
Read:  4961
Read:  9029
Read:  2627
Read:  213
Read:  842
Read:  2189

Read:  3348
Read:  3800
Read:  4203
Read:  5893
Read:  3051
Read:  4579
Read:  1347
Read:  145
Read:  2512
Read:  3006
Read:  3594
Read:  3688
Read:  136
Read:  1273
Read:  2567
Read:  3594
Read:  144
Read:  5504
Read:  4882
Read:  3016
Read:  3060
Read:  4873
Read:  4895
Read:  1996
Read:  2449
Read:  149
Read:  4020
Read:  5726
Read:  2750
Read:  1530
Read:  4374
Read:  2013
Read:  140
Read:  3954
Read:  4706
Read:  2068
Read:  1958
Read:  1624
Read:  4489
Read:  4428
Read:  7830
Read:  2114
Read:  669
Read:  184
Read:  7301
Read:  3672
Read:  642
Read:  6060
Read:  6047
Read:  356
Read:  2765
Read:  3349
Read:  3823
Read:  4778
Read:  4114
Read:  563
Read:  515
Read:  2605
Read:  3307
Read:  512
Read:  3443
Read:  2707
Read:  283
Read:  4170
Read:  1619
Read:  835
Read:  3514
Read:  1079
Read:  2875
Read:  101
Read:  4234
Read:  3689
Read:  4070
Read:  5402
Read:  2962
Read:  3752
Read:  377
Read:  712
Read:  1781
Read:  515
Read:  2868
Read:  3158
Read:  3906
Read:  2971
Read:  240

Read:  3445
Read:  1409
Read:  4709
Read:  5759
Read:  2663
Read:  380
Read:  4797
Read:  3676
Read:  4390
Read:  4395
Read:  3
Read:  1611
Read:  633
Read:  56
Read:  780
Read:  6118
Read:  5738
Read:  319
Read:  737
Read:  6750
Read:  4634
Read:  1304
Read:  1986
Read:  3
Read:  2222
Read:  513
Read:  2935
Read:  390
Read:  3
Read:  4569
Read:  2364
Read:  3453
Read:  517
Read:  1889
Read:  1312
Read:  148
Read:  176
Read:  3164
Read:  325
Read:  3
Read:  7045
Read:  3
Read:  4466
Read:  5189
Read:  4153
Read:  2733
Read:  4601
Read:  373
Read:  2322
Read:  3255
Read:  2900
Read:  3847
Read:  3
Read:  2676
Read:  2834
Read:  311
Read:  2765
Read:  3276
Read:  194
Read:  1402
Read:  3720
Read:  1722
Read:  1840
Read:  3402
Read:  3154
Read:  2752
Read:  2768
Read:  2828
Read:  298
Read:  4081
Read:  2433
Read:  4066
Read:  12802
Read:  4121
Read:  3097
Read:  2935
Read:  4447
Read:  3
Read:  2943
Read:  11332
Read:  184
Read:  3875
Read:  205
Read:  1732
Read:  1698
Read:  2805
Read: 

Read:  3471
Read:  6553
Read:  4424
Read:  4524
Read:  3945
Read:  4940
Read:  111
Read:  3623
Read:  4983
Read:  2723
Read:  334
Read:  4502
Read:  1407
Read:  2319
Read:  953
Read:  3927
Read:  13134
Read:  3
Read:  2757
Read:  5088
Read:  4469
Read:  4305
Read:  3128
Read:  2964
Read:  259
Read:  3831
Read:  5338
Read:  5889
Read:  3443
Read:  2181
Read:  2512
Read:  4365
Read:  1580
Read:  5136
Read:  3642
Read:  6047
Read:  3087
Read:  3549
Read:  2989
Read:  2282
Read:  3
Read:  1781
Read:  431
Read:  3506
Read:  3791
Read:  2804
Read:  209
Read:  2362
Read:  2023
Read:  1974
Read:  3100
Read:  413
Read:  4060
Read:  3672
Read:  4003
Read:  4656
Read:  1421
Read:  914
Read:  1819
Read:  2623
Read:  5123
Read:  392
Read:  4449
Read:  4094
Read:  3
Read:  2963
Read:  4462
Read:  1122
Read:  4736
Read:  4243
Read:  133
Read:  2959
Read:  2345
Read:  4817
Read:  3985
Read:  262
Read:  3024
Read:  3465
Read:  369
Read:  4587
Read:  5305
Read:  7148
Read:  4564
Read:  2082
Read:  3623


Read:  3920
Read:  2504
Read:  6529
Read:  5557
Read:  245
Read:  4034
Read:  3
Read:  1957
Read:  2540
Read:  5700
Read:  2526
Read:  7446
Read:  2620
Read:  3688
Read:  2433
Read:  3364
Read:  4872
Read:  938
Read:  137
Read:  3589
Read:  3874
Read:  1843
Read:  1983
Read:  3484
Read:  228
Read:  251
Read:  1197
Read:  584
Read:  1935
Read:  988
Read:  3640
Read:  8832
Read:  3
Read:  8254
Read:  4593
Read:  4834
Read:  3334
Read:  124
Read:  3
Read:  2592
Read:  3281
Read:  2512
Read:  2439
Read:  3788
Read:  1612
Read:  2565
Read:  1156
Read:  5777
Read:  1011
Read:  3594
Read:  2944
Read:  195
Read:  1698
Read:  1786
Read:  3943
Read:  4663
Read:  7030
Read:  4593
Read:  2879
Read:  1263
Read:  2378
Read:  3
Read:  2790
Read:  5159
Read:  3786
Read:  1725
Read:  2465
Read:  5886
Read:  2651
Read:  5886
Read:  4103
Read:  4649
Read:  16680
Read:  2114
Read:  1311
Read:  2355
Read:  3657
Read:  1390
Read:  966
Read:  2290
Read:  563
Read:  3630
Read:  4180
Read:  1366
Read:  529
Rea

Read:  2330
Read:  3440
Read:  4651
Read:  2115
Read:  1122
Read:  1389
Read:  2933
Read:  515
Read:  433
Read:  53
Read:  2706
Read:  2453
Read:  4020
Read:  6599
Read:  3
Read:  527
Read:  6495
Read:  8176
Read:  1224
Read:  5088
Read:  4134
Read:  3
Read:  2384
Read:  4244
Read:  3012
Read:  2585
Read:  2351
Read:  3349
Read:  9605
Read:  2742
Read:  3663
Read:  2800
Read:  3
Read:  4515
Read:  5081
Read:  3131
Read:  3060
Read:  276
Read:  4442
Read:  247
Read:  4041
Read:  3103
Read:  4158
Read:  620
Read:  10949
Read:  4180
Read:  4872
Read:  1149
Read:  3131
Read:  4446
Read:  1925
Read:  4586
Read:  1767
Read:  5160
Read:  2221
Read:  1780
Read:  2275
Read:  6134
Read:  5211
Read:  2141
Read:  2528
Read:  3314
Read:  537
Read:  5155
Read:  4682
Read:  4463
Read:  3604
Read:  5234
Read:  2144
Read:  4316
Read:  3012
Read:  5160
Read:  515
Read:  1571
Read:  3776
Read:  5908
Read:  1805
Read:  2352
Read:  516
Read:  1061
Read:  3365
Read:  4868
Read:  2742
Read:  2911
Read:  3310

Read:  6451
Read:  4630
Read:  3014
Read:  4158
Read:  3
Read:  2722
Read:  3427
Read:  2068
Read:  3241
Read:  2222
Read:  3443
Read:  447
Read:  4058
Read:  441
Read:  1899
Read:  3573
Read:  340
Read:  3016
Read:  2838
Read:  2727
Read:  5906
Read:  2377
Read:  4350
Read:  4041
Read:  2261
Read:  125
Read:  3
Read:  3763
Read:  185
Read:  375
Read:  3615
Read:  3074
Read:  2642
Read:  2547
Read:  3483
Read:  6139
Read:  3589
Read:  3260
Read:  5814
Read:  5393
Read:  1827
Read:  1918
Read:  3611
Read:  5463
Read:  4900
Read:  1928
Read:  2834
Read:  2855
Read:  3564
Read:  4283
Read:  3768
Read:  8217
Read:  98
Read:  4921
Read:  3845
Read:  7549
Read:  2137
Read:  3215
Read:  3
Read:  4994
Read:  315
Read:  3660
Read:  5091
Read:  3930
Read:  4612
Read:  5320
Read:  3245
Read:  655
Read:  4478
Read:  3036
Read:  2016
Read:  2188
Read:  2812
Read:  5857
Read:  447
Read:  3586
Read:  2201
Read:  1064
Read:  4113
Read:  3066
Read:  2426
Read:  502
Read:  184
Read:  4672
Read:  5338
Re

Read:  1284
Read:  2967
Read:  377
Read:  56
Read:  4857
Read:  3012
Read:  4602
Read:  5790
Read:  1988
Read:  4948
Read:  1929
Read:  4878
Read:  526
Read:  2679
Read:  3902
Read:  909
Read:  4243
Read:  3422
Read:  4607
Read:  3
Read:  410
Read:  4639
Read:  2054
Read:  1928
Read:  1704
Read:  2780
Read:  2171
Read:  4249
Read:  3710
Read:  4278
Read:  4788
Read:  2867
Read:  7797
Read:  4560
Read:  3314
Read:  250
Read:  4730
Read:  3470
Read:  3065
Read:  2224
Read:  2064
Read:  4516
Read:  1796
Read:  1673
Read:  5341
Read:  4138
Read:  2308
Read:  3563
Read:  2545
Read:  3660
Read:  143
Read:  2525
Read:  2481
Read:  1579
Read:  3
Read:  2630
Read:  3
Read:  2030
Read:  2567
Read:  4922
Read:  458
Read:  3119
Read:  3379
Read:  4598
Read:  4796
Read:  2355
Read:  3301
Read:  4963
Read:  4740
Read:  5081
Read:  3159
Read:  3996
Read:  3338
Read:  4077
Read:  5282
Read:  2346
Read:  2693
Read:  2684
Read:  4370
Read:  3
Read:  979
Read:  151
Read:  1299
Read:  3167
Read:  3515
Rea

Read:  499
Read:  4154
Read:  2703
Read:  3997
Read:  4448
Read:  172
Read:  3052
Read:  5619
Read:  2167
Read:  403
Read:  1799
Read:  3590
Read:  1394
Read:  4650
Read:  108
Read:  3287
Read:  2631
Read:  1257
Read:  1157
Read:  4388
Read:  6659
Read:  354
Read:  3060
Read:  364
Read:  516
Read:  4346
Read:  3916
Read:  4321
Read:  3763
Read:  5070
Read:  4259
Read:  4255
Read:  2530
Read:  2967
Read:  3544
Read:  258
Read:  420
Read:  10445
Read:  3585
Read:  5290
Read:  3847
Read:  5758
Read:  3740
Read:  2433
Read:  2926
Read:  5584
Read:  3941
Read:  11083
Read:  5182
Read:  3748
Read:  4911
Read:  12664
Read:  1647
Read:  7040
Read:  111
Read:  11252
Read:  3902
Read:  5976
Read:  2195
Read:  4403
Read:  1669
Read:  7201
Read:  6861
Read:  1688
Read:  4752
Read:  3478
Read:  3538
Read:  2056
Read:  616
Read:  4938
Read:  1391
Read:  4654
Read:  3687
Read:  1680
Read:  320
Read:  528
Read:  3740
Read:  4069
Read:  1271
Read:  2546
Read:  3
Read:  4636
Read:  2583
Read:  2037
Read

Read:  5484
Read:  2126
Read:  2789
Read:  3544
Read:  3538
Read:  3453
Read:  2595
Read:  4646
Read:  5063
Read:  1641
Read:  3
Read:  2095
Read:  1948
Read:  1072
Read:  360
Read:  4935
Read:  4143
Read:  1505
Read:  2284
Read:  5841
Read:  4305
Read:  4656
Read:  4180
Read:  3722
Read:  2705
Read:  4071
Read:  4410
Read:  3503
Read:  2119
Read:  4581
Read:  520
Read:  137
Read:  8565
Read:  9029
Read:  2770
Read:  2572
Read:  3578
Read:  3047
Read:  6118
Read:  1383
Read:  4056
Read:  3244
Read:  2891
Read:  5892
Read:  4267
Read:  2054
Read:  3227
Read:  4521
Read:  5908
Read:  1764
Read:  3575
Read:  3072
Read:  3974
Read:  1283
Read:  2063
Read:  5635
Read:  3005
Read:  1773
Read:  1868
Read:  455
Read:  1932
Read:  632
Read:  2961
Read:  610
Read:  153
Read:  3821
Read:  156
Read:  3138
Read:  4962
Read:  4331
Read:  2405
Read:  4658
Read:  4818
Read:  3761
Read:  4170
Read:  3648
Read:  6880
Read:  5194
Read:  6293
Read:  7132
Read:  4057
Read:  5316
Read:  2902
Read:  6032
Rea

Read:  4567
Read:  8005
Read:  2862
Read:  133
Read:  2314
Read:  4861
Read:  7045
Read:  1188
Read:  3896
Read:  2082
Read:  1368
Read:  5953
Read:  2636
Read:  7496
Read:  2662
Read:  3939
Read:  3316
Read:  465
Read:  4011
Read:  6928
Read:  499
Read:  7446
Read:  2575
Read:  1653
Read:  3785
Read:  2753
Read:  7919
Read:  284
Read:  2013
Read:  2659
Read:  4176
Read:  1174
Read:  4573
Read:  3193
Read:  791
Read:  667
Read:  4063
Read:  2796
Read:  2647
Read:  102
Read:  4107
Read:  2601
Read:  1515
Read:  1900
Read:  2363
Read:  5096
Read:  3128
Read:  11252
Read:  1336
Read:  4873
Read:  2911
Read:  3361
Read:  2477
Read:  2217
Read:  3568
Read:  2849
Read:  5462
Read:  6931
Read:  3337
Read:  1059
Read:  511
Read:  138
Read:  3759
Read:  495
Read:  4849
Read:  1093
Read:  2540
Read:  2046
Read:  3880
Read:  1072
Read:  3692
Read:  7005
Read:  1368
Read:  332
Read:  2031
Read:  5078
Read:  13509
Read:  610
Read:  2486
Read:  4243
Read:  3411
Read:  5081
Read:  96
Read:  4888
Read

Read:  2675
Read:  3919
Read:  2556
Read:  1978
Read:  5383
Read:  2927
Read:  4369
Read:  11212
Read:  308
Read:  2960
Read:  3740
Read:  2144
Read:  2016
Read:  2444
Read:  3577
Read:  2137
Read:  3244
Read:  2884
Read:  3393
Read:  1698
Read:  3720
Read:  5199
Read:  4344
Read:  4156
Read:  2496
Read:  3286
Read:  498
Read:  2790
Read:  3301
Read:  2201
Read:  3946
Read:  4558
Read:  5284
Read:  520
Read:  3
Read:  1112
Read:  4099
Read:  3610
Read:  392
Read:  3296
Read:  4180
Read:  3015
Read:  3
Read:  359
Read:  2540
Read:  3564
Read:  3012
Read:  3050
Read:  6854
Read:  268
Read:  3651
Read:  3
Read:  3100
Read:  4436
Read:  2434
Read:  2254
Read:  672
Read:  177
Read:  4056
Read:  187
Read:  513
Read:  5514
Read:  1178
Read:  3792
Read:  2727
Read:  2377
Read:  514
Read:  2071
Read:  493
Read:  4496
Read:  2845
Read:  826
Read:  3
Read:  4245
Read:  5189
Read:  1418
Read:  4180
Read:  5057
Read:  2463
Read:  3124
Read:  264
Read:  4125
Read:  1821
Read:  420
Read:  5000
Read: 

Read:  5790
Read:  3702
Read:  816
Read:  346
Read:  1699
Read:  4426
Read:  1274
Read:  191
Read:  2811
Read:  3590
Read:  3016
Read:  5627
Read:  5995
Read:  2159
Read:  6166
Read:  3
Read:  5407
Read:  3995
Read:  521
Read:  821
Read:  3300
Read:  4506
Read:  5282
Read:  2907
Read:  4237
Read:  2334
Read:  2953
Read:  3307
Read:  2333
Read:  3633
Read:  325
Read:  869
Read:  3059
Read:  2829
Read:  5486
Read:  3436
Read:  4179
Read:  2040
Read:  4796
Read:  5749
Read:  147
Read:  4867
Read:  4817
Read:  642
Read:  1647
Read:  160
Read:  2169
Read:  3590
Read:  296
Read:  4087
Read:  516
Read:  2949
Read:  1156
Read:  3872
Read:  2774
Read:  4282
Read:  3453
Read:  790
Read:  263
Read:  2227
Read:  3929
Read:  3448
Read:  1195
Read:  3347
Read:  3445
Read:  4155
Read:  4153
Read:  12802
Read:  5017
Read:  2690
Read:  3348
Read:  4283
Read:  2975
Read:  504
Read:  4066
Read:  1340
Read:  3097
Read:  4658
Read:  5571
Read:  1922
Read:  27447
Read:  753
Read:  3
Read:  6491
Read:  4033


Read:  3071
Read:  5913
Read:  5282
Read:  3259
Read:  2983
Read:  3753
Read:  2333
Read:  4458
Read:  514
Read:  4119
Read:  3
Read:  4251
Read:  3720
Read:  3173
Read:  2024
Read:  4366
Read:  4041
Read:  3414
Read:  3586
Read:  4121
Read:  184
Read:  7046
Read:  2591
Read:  3907
Read:  459
Read:  3
Read:  1613
Read:  7887
Read:  1217
Read:  2793
Read:  579
Read:  246
Read:  4249
Read:  6265
Read:  1376
Read:  4375
Read:  4699
Read:  268
Read:  2679
Read:  8799
Read:  6602
Read:  6018
Read:  2691
Read:  6933
Read:  4340
Read:  4143
Read:  3753
Read:  2384
Read:  2838
Read:  397
Read:  335
Read:  3453
Read:  477
Read:  3539
Read:  5409
Read:  3941
Read:  473
Read:  2670
Read:  5921
Read:  4305
Read:  4673
Read:  3615
Read:  132
Read:  174
Read:  3903
Read:  3445
Read:  4316
Read:  2181
Read:  2757
Read:  5863
Read:  6520
Read:  3986
Read:  3993
Read:  3361
Read:  2715
Read:  1341
Read:  3671
Read:  3748
Read:  4243
Read:  2470
Read:  1456
Read:  372
Read:  509
Read:  3802
Read:  3308


Read:  3659
Read:  5090
Read:  524
Read:  1333
Read:  3843
Read:  1203
Read:  192
Read:  4683
Read:  145
Read:  4183
Read:  3818
Read:  5096
Read:  3616
Read:  4426
Read:  5102
Read:  3864
Read:  8404
Read:  1437
Read:  4397
Read:  3
Read:  350
Read:  632
Read:  4293
Read:  2414
Read:  5913
Read:  3365
Read:  2617
Read:  3885
Read:  4900
Read:  3039
Read:  3
Read:  5829
Read:  6423
Read:  238
Read:  947
Read:  2356
Read:  1944
Read:  3075
Read:  3614
Read:  709
Read:  1917
Read:  1061
Read:  135
Read:  3132
Read:  2221
Read:  741
Read:  8164
Read:  4774
Read:  5260
Read:  6557
Read:  142
Read:  4666
Read:  1814
Read:  284
Read:  1515
Read:  336
Read:  2516
Read:  2187
Read:  3906
Read:  1726
Read:  5486
Read:  514
Read:  3726
Read:  3
Read:  518
Read:  4287
Read:  472
Read:  2662
Read:  1869
Read:  191
Read:  5088
Read:  590
Read:  3
Read:  4568
Read:  3590
Read:  1214
Read:  133
Read:  7937
Read:  315
Read:  3703
Read:  4976
Read:  3975
Read:  2253
Read:  2314
Read:  5084
Read:  2180


Read:  2895
Read:  3371
Read:  5268
Read:  5457
Read:  4800
Read:  1244
Read:  1422
Read:  2148
Read:  891
Read:  584
Read:  3348
Read:  4418
Read:  2768
Read:  193
Read:  2760
Read:  2405
Read:  2885
Read:  1214
Read:  3651
Read:  412
Read:  4801
Read:  494
Read:  8204
Read:  173
Read:  418
Read:  1292
Read:  4160
Read:  2113
Read:  8701
Read:  9270
Read:  3760
Read:  1790
Read:  269
Read:  737
Read:  2447
Read:  4251
Read:  4788
Read:  3597
Read:  5816
Read:  3
Read:  872
Read:  4376
Read:  163
Read:  3290
Read:  1105
Read:  6797
Read:  149
Read:  7324
Read:  1399
Read:  4254
Read:  2567
Read:  15
Read:  1159
Read:  6396
Read:  4911
Read:  2229
Read:  3
Read:  6341
Read:  6659
Read:  2055
Read:  4097
Read:  100
Read:  3917
Read:  5221
Read:  451
Read:  1780
Read:  3314
Read:  177
Read:  1498
Read:  2105
Read:  408
Read:  5268
Read:  2873
Read:  5556
Read:  2378
Read:  3
Read:  1383
Read:  3513
Read:  3102
Read:  2918
Read:  5547
Read:  4706
Read:  354
Read:  4883
Read:  6105
Read:  1

Read:  1652
Read:  2068
Read:  3077
Read:  411
Read:  3144
Read:  2665
Read:  226
Read:  4963
Read:  2532
Read:  3184
Read:  512
Read:  3
Read:  3124
Read:  3
Read:  3507
Read:  131
Read:  4549
Read:  8018
Read:  521
Read:  3102
Read:  3757
Read:  2388
Read:  2682
Read:  2346
Read:  4783
Read:  1352
Read:  3119
Read:  4491
Read:  4521
Read:  4702
Read:  422
Read:  8547
Read:  3529
Read:  3359
Read:  857
Read:  111
Read:  1637
Read:  2469
Read:  2981
Read:  2918
Read:  2183
Read:  277
Read:  7489
Read:  4972
Read:  2836
Read:  2523
Read:  1913
Read:  1925
Read:  4635
Read:  1827
Read:  506
Read:  737
Read:  467
Read:  1569
Read:  3980
Read:  3006
Read:  5112
Read:  3126
Read:  936
Read:  1303
Read:  1174
Read:  5584
Read:  2611
Read:  3653
Read:  4692
Read:  1020
Read:  6737
Read:  1340
Read:  4305
Read:  2351
Read:  3418
Read:  2727
Read:  3081
Read:  169
Read:  320
Read:  1915
Read:  3615
Read:  317
Read:  5212
Read:  450
Read:  4627
Read:  241
Read:  4376
Read:  3137
Read:  335
Read:

Read:  387
Read:  1061
Read:  3223
Read:  2242
Read:  3100
Read:  5985
Read:  1594
Read:  4021
Read:  3
Read:  503
Read:  492
Read:  3
Read:  3
Read:  516
Read:  170
Read:  3100
Read:  6754
Read:  5701
Read:  2657
Read:  4207
Read:  1185
Read:  116
Read:  4601
Read:  2392
Read:  2308
Read:  7
Read:  2756
Read:  2320
Read:  6766
Read:  357
Read:  4623
Read:  2197
Read:  5518
Read:  1319
Read:  598
Read:  1333
Read:  3902
Read:  3310
Read:  3
Read:  378
Read:  3792
Read:  516
Read:  473
Read:  4043
Read:  4958
Read:  2092
Read:  8466
Read:  1151
Read:  3642
Read:  6626
Read:  4305
Read:  1163
Read:  139
Read:  1028
Read:  4188
Read:  135
Read:  3186
Read:  3344
Read:  1297
Read:  6230
Read:  10001
Read:  2640
Read:  2774
Read:  5144
Read:  2253
Read:  2839
Read:  2485
Read:  1902
Read:  3466
Read:  3415
Read:  2238
Read:  4643
Read:  3144
Read:  2275
Read:  3386
Read:  2816
Read:  1631
Read:  5553
Read:  186
Read:  1971
Read:  1618
Read:  5219
Read:  3495
Read:  3646
Read:  4159
Read:  1

Read:  3445
Read:  4081
Read:  4176
Read:  3153
Read:  3872
Read:  237
Read:  947
Read:  5905
Read:  2311
Read:  5221
Read:  5504
Read:  6211
Read:  3873
Read:  1778
Read:  4093
Read:  2964
Read:  682
Read:  1085
Read:  4860
Read:  3233
Read:  2290
Read:  5879
Read:  3301
Read:  6605
Read:  4837
Read:  171
Read:  141
Read:  5100
Read:  3068
Read:  4058
Read:  11413
Read:  3198
Read:  440
Read:  737
Read:  4066
Read:  1199
Read:  156
Read:  4057
Read:  1460
Read:  4745
Read:  4165
Read:  4419
Read:  2894
Read:  2616
Read:  1476
Read:  4240
Read:  2820
Read:  4723
Read:  5767
Read:  2059
Read:  2070
Read:  411
Read:  8361
Read:  5210
Read:  5040
Read:  6924
Read:  4382
Read:  4205
Read:  1866
Read:  6703
Read:  1295
Read:  3
Read:  516
Read:  3375
Read:  2184
Read:  251
Read:  3724
Read:  4810
Read:  104
Read:  2156
Read:  192
Read:  1419
Read:  3943
Read:  4372
Read:  3833
Read:  3074
Read:  6205
Read:  5149
Read:  3727
Read:  704
Read:  5788
Read:  2616
Read:  3973
Read:  724
Read:  45

Read:  2422
Read:  1280
Read:  1917
Read:  2416
Read:  3496
Read:  5208
Read:  3784
Read:  3004
Read:  5323
Read:  1057
Read:  3362
Read:  1437
Read:  3216
Read:  3551
Read:  479
Read:  1486
Read:  521
Read:  1764
Read:  1789
Read:  3
Read:  4556
Read:  1938
Read:  519
Read:  1827
Read:  2945
Read:  421
Read:  1096
Read:  99
Read:  4066
Read:  508
Read:  4054
Read:  4134
Read:  2535
Read:  6735
Read:  4781
Read:  2941
Read:  4253
Read:  1879
Read:  526
Read:  1011
Read:  1567
Read:  1455
Read:  5901
Read:  4051
Read:  170
Read:  1065
Read:  3096
Read:  2694
Read:  2418
Read:  610
Read:  501
Read:  2226
Read:  3374
Read:  4011
Read:  6095
Read:  425
Read:  5181
Read:  2141
Read:  2393
Read:  1117
Read:  3586
Read:  323
Read:  2294
Read:  5281
Read:  4639
Read:  2524
Read:  521
Read:  113
Read:  5488
Read:  2253
Read:  1987
Read:  2869
Read:  2912
Read:  3532
Read:  5788
Read:  3455
Read:  1188
Read:  3893
Read:  1195
Read:  3839
Read:  3145
Read:  3642
Read:  6164
Read:  2489
Read:  234

Read:  2476
Read:  316
Read:  517
Read:  1175
Read:  8769
Read:  7145
Read:  3367
Read:  2876
Read:  3825
Read:  5221
Read:  5606
Read:  3113
Read:  3125
Read:  171
Read:  3320
Read:  307
Read:  1413
Read:  191
Read:  1000
Read:  2595
Read:  3513
Read:  5338
Read:  2729
Read:  523
Read:  531
Read:  3
Read:  253
Read:  6105
Read:  5329
Read:  525
Read:  3867
Read:  6358
Read:  3024
Read:  134
Read:  3307
Read:  5917
Read:  1693
Read:  134
Read:  449
Read:  4361
Read:  1991
Read:  3014
Read:  3337
Read:  4175
Read:  567
Read:  2504
Read:  2022
Read:  2297
Read:  676
Read:  2927
Read:  3790
Read:  3286
Read:  3646
Read:  710
Read:  75
Read:  1437
Read:  487
Read:  3308
Read:  4176
Read:  477
Read:  3096
Read:  3486
Read:  2170
Read:  3805
Read:  2574
Read:  12099
Read:  1755
Read:  2611
Read:  1610
Read:  3843
Read:  3196
Read:  1703
Read:  1300
Read:  2187
Read:  3052
Read:  4854
Read:  3143
Read:  2798
Read:  4031
Read:  741
Read:  360
Read:  251
Read:  411
Read:  2605
Read:  2930
Read:

Read:  3
Read:  3
Read:  517
Read:  4730
Read:  521
Read:  2549
Read:  2196
Read:  3
Read:  7446
Read:  4081
Read:  1929
Read:  4861
Read:  3484
Read:  3301
Read:  494
Read:  3268
Read:  2411
Read:  2713
Read:  7103
Read:  3247
Read:  3802
Read:  3278
Read:  1084
Read:  3720
Read:  2790
Read:  289
Read:  3
Read:  3882
Read:  3910
Read:  2447
Read:  5611
Read:  3143
Read:  3920
Read:  3
Read:  346
Read:  3338
Read:  319
Read:  5245
Read:  3
Read:  3
Read:  1799
Read:  1958
Read:  3385
Read:  3299
Read:  1727
Read:  4120
Read:  3315
Read:  3906
Read:  3978
Read:  4153
Read:  5254
Read:  283
Read:  2345
Read:  4306
Read:  7288
Read:  3257
Read:  659
Read:  256
Read:  3975
Read:  3733
Read:  3041
Read:  1226
Read:  2259
Read:  5825
Read:  6412
Read:  4658
Read:  2112
Read:  2651
Read:  1818
Read:  5209
Read:  5199
Read:  3953
Read:  3352
Read:  3
Read:  8544
Read:  3
Read:  315
Read:  2217
Read:  14140
Read:  1154
Read:  5456
Read:  3
Read:  3979
Read:  1616
Read:  4836
Read:  3479
Read:  

Read:  3263
Read:  490
Read:  3372
Read:  4669
Read:  5457
Read:  3077
Read:  6265
Read:  139
Read:  3
Read:  2843
Read:  90
Read:  4405
Read:  6639
Read:  2063
Read:  4621
Read:  5866
Read:  7826
Read:  394
Read:  3642
Read:  4882
Read:  441
Read:  513
Read:  2735
Read:  10667
Read:  3864
Read:  3831
Read:  4959
Read:  2620
Read:  3592
Read:  339
Read:  3
Read:  4922
Read:  1338
Read:  7132
Read:  4172
Read:  3394
Read:  567
Read:  191
Read:  4880
Read:  440
Read:  155
Read:  2575
Read:  3983
Read:  2279
Read:  5740
Read:  135
Read:  676
Read:  2202
Read:  4317
Read:  3548
Read:  2794
Read:  1014
Read:  6251
Read:  5174
Read:  4503
Read:  3668
Read:  7919
Read:  3079
Read:  4305
Read:  2628
Read:  3193
Read:  3169
Read:  1873
Read:  4374
Read:  1852
Read:  3586
Read:  5806
Read:  2770
Read:  187
Read:  3331
Read:  763
Read:  4628
Read:  4907
Read:  3766
Read:  5698
Read:  419
Read:  2601
Read:  4604
Read:  2463
Read:  2636
Read:  486
Read:  2753
Read:  3068
Read:  2476
Read:  484
Read

Read:  2449
Read:  2829
Read:  2946
Read:  2872
Read:  196
Read:  880
Read:  2888
Read:  2041
Read:  4182
Read:  2090
Read:  6363
Read:  2299
Read:  10204
Read:  151
Read:  3000
Read:  342
Read:  3445
Read:  3
Read:  3912
Read:  518
Read:  2787
Read:  1782
Read:  1467
Read:  348
Read:  4783
Read:  3653
Read:  1883
Read:  1893
Read:  277
Read:  4216
Read:  4911
Read:  3
Read:  2679
Read:  482
Read:  3596
Read:  2574
Read:  4317
Read:  5191
Read:  1997
Read:  3
Read:  2739
Read:  4660
Read:  2813
Read:  2476
Read:  3078
Read:  5116
Read:  69
Read:  4965
Read:  5043
Read:  415
Read:  375
Read:  3361
Read:  3843
Read:  4125
Read:  234
Read:  450
Read:  982
Read:  3793
Read:  3021
Read:  6589
Read:  3285
Read:  1621
Read:  2602
Read:  927
Read:  2261
Read:  4379
Read:  1802
Read:  5162
Read:  2739
Read:  3094
Read:  4208
Read:  1757
Read:  3
Read:  2927
Read:  4550
Read:  1712
Read:  3
Read:  5499
Read:  6883
Read:  3994
Read:  3010
Read:  3906
Read:  1471
Read:  4921
Read:  2828
Read:  480

Read:  1702
Read:  5154
Read:  5444
Read:  438
Read:  5329
Read:  18968
Read:  3135
Read:  2346
Read:  5642
Read:  208
Read:  4328
Read:  3738
Read:  1618
Read:  5841
Read:  1156
Read:  2939
Read:  5066
Read:  1478
Read:  3
Read:  187
Read:  3171
Read:  3453
Read:  2912
Read:  2690
Read:  3954
Read:  1617
Read:  142
Read:  4228
Read:  1214
Read:  3300
Read:  2238
Read:  593
Read:  303
Read:  4610
Read:  3461
Read:  3399
Read:  358
Read:  511
Read:  512
Read:  4198
Read:  4121
Read:  8139
Read:  3
Read:  4111
Read:  1245
Read:  4510
Read:  3198
Read:  1953
Read:  609
Read:  4931
Read:  888
Read:  197
Read:  3530
Read:  3671
Read:  2687
Read:  4266
Read:  3161
Read:  2260
Read:  2915
Read:  3800
Read:  247
Read:  4679
Read:  6450
Read:  5286
Read:  3761
Read:  2262
Read:  5346
Read:  422
Read:  7132
Read:  869
Read:  4786
Read:  4018
Read:  2110
Read:  1686
Read:  1656
Read:  1848
Read:  3329
Read:  299
Read:  2211
Read:  6450
Read:  5573
Read:  2910
Read:  5924
Read:  2619
Read:  2244
R

Read:  5059
Read:  3435
Read:  5166
Read:  3284
Read:  523
Read:  3883
Read:  5139
Read:  2260
Read:  1686
Read:  2673
Read:  2201
Read:  255
Read:  4926
Read:  378
Read:  3602
Read:  917
Read:  7734
Read:  2195
Read:  3896
Read:  3844
Read:  151
Read:  3616
Read:  3843
Read:  3650
Read:  988
Read:  3446
Read:  3229
Read:  6928
Read:  2584
Read:  6257
Read:  2250
Read:  10177
Read:  2200
Read:  3808
Read:  3227
Read:  3114
Read:  447
Read:  2854
Read:  5356
Read:  3240
Read:  4158
Read:  5480
Read:  3316
Read:  2990
Read:  2479
Read:  436
Read:  5390
Read:  3
Read:  2341
Read:  2205
Read:  3238
Read:  2857
Read:  513
Read:  4694
Read:  1480
Read:  3618
Read:  1631
Read:  3767
Read:  2095
Read:  2841
Read:  2630
Read:  3930
Read:  5230
Read:  8465
Read:  5209
Read:  3536
Read:  3376
Read:  502
Read:  4709
Read:  2859
Read:  3
Read:  3355
Read:  4185
Read:  4164
Read:  259
Read:  3604
Read:  134
Read:  4469
Read:  1696
Read:  4301
Read:  514
Read:  3877
Read:  438
Read:  507
Read:  4172


Read:  4999
Read:  5165
Read:  2935


In [15]:
len(list_text)

23318

In [34]:
cvr = CountVectorizer(lowercase=True,stop_words='english',decode_error='ignore')
cvr.fit(list_text)

len(cvr.get_feature_names())

90583

In [35]:
X_train = pd.DataFrame(cvr.transform(list_text).todense(),
                       columns=cvec.get_feature_names())

MemoryError: 

In [None]:
X_train.shape

In [16]:
# From 4.23, inClass_PRAC
def lemmatize_stemming(text):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text):
        # Give word . Return the fequency of word\n",
        # Exclude STOPWORD, everything very less then 3\n",
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
            
    return result

In [33]:
data_preprocess = preprocess(list_text)

TypeError: decoding to str: need a bytes-like object, list found

In [5]:
frame['contents'].isna().sum()

10617

In [6]:
frame.drop(labels=['tags','contents.1','tags.1','author.1'], axis=1, inplace=True)
frame.head()

Unnamed: 0,author,contents,date,title,tweetid,url
0,,"It's a warm afternoon in Miami, and 35-year-ol...",Mon Apr 06 16:13:15 +0000 2015,RT @JennyAGold: Maybe You Should Skip That Ann...,5.851130446210662e+17,http://www.npr.org/blogs/health/2015/04/06/397...
1,,"California's health insurance marketplace, Cov...",Wed Dec 24 18:40:33 +0000 2014,Would You Like Health Insurance With Those Sto...,5.47824165185536e+17,http://n.pr/1CKulog
2,,"On Tuesday, the Food and Drug Administration r...",Wed Dec 24 14:41:04 +0000 2014,New Blood Donation Rules Would Still Exclude M...,5.477638976381993e+17,http://n.pr/1CJ9IsG
3,,"This time last year, federal officials were sc...",Wed Dec 24 08:34:09 +0000 2014,Obama Administration Downplays Court Challenge...,5.476715604718592e+17,http://n.pr/1CHaDd6
4,,"Ebola has cast a shadow over Liberia, but it c...",Wed Dec 24 08:34:08 +0000 2014,"Christmas In Liberia: Ebola Fears, No Snow, Ho...",5.47671553609982e+17,http://n.pr/1CHaAxQ


In [7]:
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19989 entries, 0 to 19988
Data columns (total 6 columns):
author      4412 non-null object
contents    9372 non-null object
date        19989 non-null object
title       19989 non-null object
tweetid     19989 non-null object
url         17521 non-null object
dtypes: object(6)
memory usage: 937.1+ KB


In [8]:
frame['date'] = pd.to_datetime(frame['date'])

ValueError: ('Unknown string format:', 'date')

In [14]:
frame.fillna('Blank')

0        It's a warm afternoon in Miami, and 35-year-ol...
1        California's health insurance marketplace, Cov...
2        On Tuesday, the Food and Drug Administration r...
3        This time last year, federal officials were sc...
4        Ebola has cast a shadow over Liberia, but it c...
5        Every week, Dr. Michael Poshkus visits the Joh...
6        Imagine you live on a floating lake house. Ope...
7        We know that you can be fat while still fit, b...
8        We have received a bunch of questions about en...
9        Garrett Peterson was born in 2012 with a defec...
10       If your cardiologist is away at a conference w...
11       A new study says that the mortality risk for p...
12       Compared with other primates and our early hum...
13       For a few weeks last year, Michael Tranfaglia ...
14       Chronic pain affects tens of millions of Ameri...
15       I'm a proud nurse from a proud family of nurse...
16       I'm a proud nurse from a proud family of nurse.

In [8]:
frame.iloc[59]

author                                 NaN
contents                               NaN
date        Sun Dec 14 20:05:17 +0000 2014
title                              Health 
tweetid              5.442216097419346e+17
url                    http://n.pr/1BDX9hO
Name: 59, dtype: object

In [16]:
dest_url = ''
# Write to file txt
file_names = frame['tweetid'].values

for key, name in enumerate(file_names[:100]):
    
    # Construct the file name
    located_file = './training/' + str(name) + '.txt'
    
    
    
    try:
        # Open file for writing
        f= open(located_file,"w+")
        
        # Writing on file
        contents = str(frame['contents'].iloc[key])
        print(key, ' : ' ,len(contents))
        if contents is not 'blank':
            f.write(contents)
        else:
            print(len(contents))

        # close file connection
        f.close()
    except:
        pass

0  :  <class 'str'>
1  :  <class 'str'>
2  :  <class 'str'>
3  :  <class 'str'>
4  :  <class 'str'>
5  :  <class 'str'>
6  :  <class 'str'>
7  :  <class 'str'>
8  :  <class 'str'>
9  :  <class 'str'>
10  :  <class 'str'>
11  :  <class 'str'>
12  :  <class 'str'>
13  :  <class 'str'>
14  :  <class 'str'>
15  :  <class 'str'>
16  :  <class 'str'>
17  :  <class 'str'>
18  :  <class 'str'>
19  :  <class 'str'>
20  :  <class 'str'>
21  :  <class 'str'>
22  :  <class 'str'>
23  :  <class 'str'>
24  :  <class 'str'>
25  :  <class 'str'>
26  :  <class 'str'>
27  :  <class 'str'>
28  :  <class 'str'>
29  :  <class 'str'>
30  :  <class 'str'>
31  :  <class 'str'>
32  :  <class 'str'>
33  :  <class 'str'>
34  :  <class 'str'>
35  :  <class 'str'>
36  :  <class 'str'>
37  :  <class 'str'>
38  :  <class 'str'>
39  :  <class 'str'>
40  :  <class 'str'>
41  :  <class 'str'>
42  :  <class 'str'>
43  :  <class 'str'>
44  :  <class 'str'>
45  :  <class 'str'>
46  :  <class 'str'>
47  :  <class 'str'>
48

In [None]:
frame.info()

In [None]:
df = frame[['date']].copy()
df.head()

In [None]:
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.strftime('%b')
df['weekday'] = df['date'].dt.strftime('%A')
df['hour'] = df['date'].dt.hour
df['minutes'] = df['date'].dt.strftime('%M')

In [None]:
df.head()

In [None]:
df['weekday'].value_counts()

In [None]:
df['month'].value_counts()

In [None]:
df['hour'].value_counts().sort_index()

In [None]:
test_txt = """
Students were urged to make sure their vaccinations were up to date following reports of cases of mumps at Cardiff Metropolitan University in April.

Last year, Public Health Wales asked parents to get their children vaccinated after outbreaks of measles around south east Wales.

The report also looks at the threat from resistance to antibiotics.

Global overuse of antibiotics has led to a reduction in their effectiveness, as bacteria grow immune to them.

The report says this has already led to a small number of difficult to treat infections, leading to "failed therapy and potential complications".

A focus on reducing antibiotic use in GPs' surgeries saw a fall of nearly 12% in their use over five years, the report added.

"""

tnlp = NLP(test_txt)
tnlp.tokenizer()

tnlp.stemming()